cxgb_cpl_io.c revision 174712
1/************************************************************************** 2 3Copyright (c) 2007, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 174712 2007-12-17 10:02:29Z kmacy $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/lock.h> 39#include <sys/mbuf.h> 40#include <sys/mutex.h> 41#include <sys/socket.h> 42#include <sys/sysctl.h> 43#include <sys/syslog.h> 44#include <sys/socketvar.h> 45#include <sys/protosw.h> 46#include <sys/priv.h> 47 48#include <net/if.h> 49#include <net/route.h> 50 51#include <netinet/in.h> 52#include <netinet/in_pcb.h> 53#include <netinet/in_systm.h> 54#include <netinet/in_var.h> 55 56 57#include <dev/cxgb/cxgb_osdep.h> 58#include <dev/cxgb/sys/mbufq.h> 59 60#include <netinet/ip.h> 61#include <netinet/tcp_var.h> 62#include <netinet/tcp_fsm.h> 63#include <netinet/tcp_offload.h> 64#include <netinet/tcp_seq.h> 65#include <netinet/tcp_syncache.h> 66#include <net/route.h> 67 68 69#include <dev/cxgb/t3cdev.h> 70#include <dev/cxgb/common/cxgb_firmware_exports.h> 71#include <dev/cxgb/common/cxgb_t3_cpl.h> 72#include <dev/cxgb/common/cxgb_tcb.h> 73#include <dev/cxgb/common/cxgb_ctl_defs.h> 74#include <dev/cxgb/cxgb_l2t.h> 75#include <dev/cxgb/cxgb_offload.h> 76#include <vm/vm.h> 77#include <vm/pmap.h> 78#include <machine/bus.h> 79#include <dev/cxgb/sys/mvec.h> 80#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> 81#include <dev/cxgb/ulp/tom/cxgb_defs.h> 82#include <dev/cxgb/ulp/tom/cxgb_tom.h> 83#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> 84#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> 85#include <dev/cxgb/ulp/tom/cxgb_tcp.h> 86 87 88 89/* 90 * For ULP connections HW may add headers, e.g., for digests, that aren't part 91 * of the messages sent by the host but that are part of the TCP payload and 92 * therefore consume TCP sequence space. 
Tx connection parameters that 93 * operate in TCP sequence space are affected by the HW additions and need to 94 * compensate for them to accurately track TCP sequence numbers. This array 95 * contains the compensating extra lengths for ULP packets. It is indexed by 96 * a packet's ULP submode. 97 */ 98const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; 99 100#ifdef notyet 101/* 102 * This sk_buff holds a fake header-only TCP segment that we use whenever we 103 * need to exploit SW TCP functionality that expects TCP headers, such as 104 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple 105 * CPUs without locking. 106 */ 107static struct mbuf *tcphdr_mbuf __read_mostly; 108#endif 109 110/* 111 * Size of WRs in bytes. Note that we assume all devices we are handling have 112 * the same WR size. 113 */ 114static unsigned int wrlen __read_mostly; 115 116/* 117 * The number of WRs needed for an skb depends on the number of page fragments 118 * in the skb and whether it has any payload in its main body. This maps the 119 * length of the gather list represented by an skb into the # of necessary WRs. 120 */ 121static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly; 122 123/* 124 * Max receive window supported by HW in bytes. Only a small part of it can 125 * be set through option0, the rest needs to be set through RX_DATA_ACK. 126 */ 127#define MAX_RCV_WND ((1U << 27) - 1) 128 129/* 130 * Min receive window. We want it to be large enough to accommodate receive 131 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
 */
#define MIN_RCV_WND (24 * 1024U)
/* Extract the IP TOS bits from the inpcb in the form the TCAM expects. */
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

/* Autosizing tunables owned by the main TCP stack. */
extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);


/*
 * Return non-zero if the TOE device is a rev-A T3 part.
 */
static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

/*
 * Debug helper: dump the interesting fields of an offload PCB.
 */
static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

/*
 * Wrapper around rtalloc1() that returns the route unlocked, or NULL
 * if no route was found.
 */
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
186 */ 187static inline void 188send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t) 189{ 190 struct toepcb *toep = tp->t_toe; 191 192 193 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { 194 INP_LOCK(tp->t_inpcb); 195 mbufq_tail(&toep->out_of_order_queue, m); // defer 196 INP_UNLOCK(tp->t_inpcb); 197 } else if (through_l2t) 198 l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T 199 else 200 cxgb_ofld_send(T3C_DEV(so), m); // send directly 201} 202 203static inline unsigned int 204mkprio(unsigned int cntrl, const struct socket *so) 205{ 206 return cntrl; 207} 208 209/* 210 * Populate a TID_RELEASE WR. The skb must be already propely sized. 211 */ 212static inline void 213mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid) 214{ 215 struct cpl_tid_release *req; 216 217 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so)); 218 m->m_pkthdr.len = m->m_len = sizeof(*req); 219 req = mtod(m, struct cpl_tid_release *); 220 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 221 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); 222} 223 224static inline void 225make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) 226{ 227 struct tcpcb *tp = sototcpcb(so); 228 struct toepcb *toep = tp->t_toe; 229 struct tx_data_wr *req; 230 231 INP_LOCK_ASSERT(tp->t_inpcb); 232 233 req = mtod(m, struct tx_data_wr *); 234 m->m_len = sizeof(*req); 235 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 236 req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); 237 /* len includes the length of any HW ULP additions */ 238 req->len = htonl(len); 239 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); 240 /* V_TX_ULP_SUBMODE sets both the mode and submode */ 241 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | 242 V_TX_URG(/* skb_urgent(skb) */ 0 ) | 243 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && 244 (tail ? 
0 : 1)))); 245 req->sndseq = htonl(tp->snd_nxt); 246 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { 247 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 248 V_TX_CPU_IDX(toep->tp_qset)); 249 250 /* Sendbuffer is in units of 32KB. 251 */ 252 if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) 253 req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15)); 254 else 255 req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15)); 256 toep->tp_flags |= TP_DATASENT; 257 } 258} 259 260int 261t3_push_frames(struct socket *so, int req_completion) 262{ 263 struct tcpcb *tp = sototcpcb(so); 264 struct toepcb *toep = tp->t_toe; 265 266 struct mbuf *tail, *m0, *last; 267 struct t3cdev *cdev; 268 struct tom_data *d; 269 int bytes, count, total_bytes; 270 bus_dma_segment_t segs[TX_MAX_SEGS], *segp; 271 segp = segs; 272 273 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { 274 DPRINTF("tcp state=%d\n", tp->t_state); 275 return (0); 276 } 277 278 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { 279 DPRINTF("disconnecting\n"); 280 281 return (0); 282 } 283 284 INP_LOCK_ASSERT(tp->t_inpcb); 285 286 SOCKBUF_LOCK(&so->so_snd); 287 288 d = TOM_DATA(TOE_DEV(so)); 289 cdev = d->cdev; 290 last = tail = so->so_snd.sb_sndptr ? 
so->so_snd.sb_sndptr : so->so_snd.sb_mb; 291 total_bytes = 0; 292 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", 293 toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last); 294 295 if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) { 296 KASSERT(tail, ("sbdrop error")); 297 last = tail = tail->m_next; 298 } 299 300 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { 301 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); 302 SOCKBUF_UNLOCK(&so->so_snd); 303 return (0); 304 } 305 306 toep->tp_m_last = NULL; 307 while (toep->tp_wr_avail && (tail != NULL)) { 308 count = bytes = 0; 309 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { 310 SOCKBUF_UNLOCK(&so->so_snd); 311 return (0); 312 } 313 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) 314 && (tail != NULL) && (count < TX_MAX_SEGS)) { 315 bytes += tail->m_len; 316 count++; 317 last = tail; 318 /* 319 * technically an abuse to be using this for a VA 320 * but less gross than defining my own structure 321 * or calling pmap_kextract from here :-| 322 */ 323 segp->ds_addr = (bus_addr_t)tail->m_data; 324 segp->ds_len = tail->m_len; 325 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", 326 count, mbuf_wrs[count], tail->m_data, tail->m_len); 327 328 segp++; 329 tail = tail->m_next; 330 } 331 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", 332 toep->tp_wr_avail, count, mbuf_wrs[count], tail); 333 if (tail) { 334 so->so_snd.sb_sndptr = tail; 335 toep->tp_m_last = NULL; 336 } else 337 toep->tp_m_last = so->so_snd.sb_sndptr = last; 338 339 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); 340 341 so->so_snd.sb_sndptroff += bytes; 342 total_bytes += bytes; 343 toep->tp_write_seq += bytes; 344 345 346 SOCKBUF_UNLOCK(&so->so_snd); 347 348 /* 349 * XXX can drop socket buffer lock here 350 */ 351 352 toep->tp_wr_avail -= mbuf_wrs[count]; 353 toep->tp_wr_unacked += mbuf_wrs[count]; 354 355 make_tx_data_wr(so, m0, bytes, tail); 356 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so)); 357 
m_set_sgl(m0, segs); 358 m_set_sgllen(m0, count); 359 /* 360 * remember credits used 361 */ 362 m0->m_pkthdr.csum_data = mbuf_wrs[count]; 363 m0->m_pkthdr.len = bytes; 364 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || 365 toep->tp_wr_unacked >= toep->tp_wr_max / 2) { 366 struct work_request_hdr *wr = cplhdr(m0); 367 368 wr->wr_hi |= htonl(F_WR_COMPL); 369 toep->tp_wr_unacked = 0; 370 } 371 372 m0->m_type = MT_DONTFREE; 373 enqueue_wr(toep, m0); 374 DPRINTF("sending offload tx with %d bytes in %d segments\n", 375 bytes, count); 376 377 l2t_send(cdev, m0, toep->tp_l2t); 378 if (toep->tp_wr_avail && (tail != NULL)) 379 SOCKBUF_LOCK(&so->so_snd); 380 } 381 382 SOCKBUF_UNLOCK_ASSERT(&so->so_snd); 383 return (total_bytes); 384} 385 386/* 387 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail 388 * under any circumstances. We take the easy way out and always queue the 389 * message to the write_queue. We can optimize the case where the queue is 390 * already empty though the optimization is probably not worth it. 
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;


	INP_LOCK(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;

	/* Flush any pending send data before the FIN. */
	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	/* Only ever send one FIN per connection. */
	if (toep->tp_flags & TP_FIN_SENT) {
		INP_UNLOCK(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	/* rsvd carries the sequence number of the FIN. */
	req->rsvd = htonl(toep->tp_write_seq);
	INP_UNLOCK(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	cxgb_ofld_send(d->cdev, m);

}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an sk_buff.  Returns the number of credits sent.
 *
 * NOTE(review): nofail is currently unused in this body — allocation goes
 * through m_gethdr_nofail() unconditionally.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep)));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}


/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
486 */ 487void 488t3_cleanup_rbuf(struct tcpcb *tp) 489{ 490 struct toepcb *toep = tp->t_toe; 491 struct socket *so; 492 struct toedev *dev; 493 int dack_mode, must_send, read; 494 u32 thres, credits, dack = 0; 495 496 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 497 (tp->t_state == TCPS_FIN_WAIT_2))) 498 return; 499 INP_LOCK_ASSERT(tp->t_inpcb); 500 501 so = tp->t_inpcb->inp_socket; 502 SOCKBUF_LOCK(&so->so_rcv); 503 read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc; 504 toep->tp_copied_seq += read; 505 toep->tp_enqueued_bytes -= read; 506 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 507 SOCKBUF_UNLOCK(&so->so_rcv); 508 509 if (credits > so->so_rcv.sb_mbmax) 510 printf("copied_seq=%u rcv_wup=%u credits=%u\n", 511 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 512 /* 513 * XXX this won't accurately reflect credit return - we need 514 * to look at the difference between the amount that has been 515 * put in the recv sockbuf and what is there now 516 */ 517 518 if (__predict_false(!credits)) 519 return; 520 521 dev = toep->tp_toedev; 522 thres = TOM_TUNABLE(dev, rx_credit_thres); 523 524 if (__predict_false(thres == 0)) 525 return; 526 527 if (toep->tp_ulp_mode) 528 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 529 else { 530 dack_mode = TOM_TUNABLE(dev, delack); 531 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 532 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 533 534 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 535 dack = F_RX_DACK_CHANGE | 536 V_RX_DACK_MODE(dack_mode); 537 } 538 } 539 540 /* 541 * For coalescing to work effectively ensure the receive window has 542 * at least 16KB left. 
543 */ 544 must_send = credits + 16384 >= tp->rcv_wnd; 545 546 if (must_send || credits >= thres) 547 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 548} 549 550static int 551cxgb_toe_disconnect(struct tcpcb *tp) 552{ 553 struct socket *so; 554 555 DPRINTF("cxgb_toe_disconnect\n"); 556 557 so = tp->t_inpcb->inp_socket; 558 close_conn(so); 559 return (0); 560} 561 562static int 563cxgb_toe_reset(struct tcpcb *tp) 564{ 565 struct toepcb *toep = tp->t_toe; 566 567 568 t3_send_reset(toep); 569 570 /* 571 * unhook from socket 572 */ 573 tp->t_flags &= ~TF_TOE; 574 toep->tp_tp = NULL; 575 tp->t_toe = NULL; 576 return (0); 577} 578 579static int 580cxgb_toe_send(struct tcpcb *tp) 581{ 582 struct socket *so; 583 584 DPRINTF("cxgb_toe_send\n"); 585 dump_toepcb(tp->t_toe); 586 587 so = tp->t_inpcb->inp_socket; 588 t3_push_frames(so, 1); 589 return (0); 590} 591 592static int 593cxgb_toe_rcvd(struct tcpcb *tp) 594{ 595 INP_LOCK_ASSERT(tp->t_inpcb); 596 t3_cleanup_rbuf(tp); 597 598 return (0); 599} 600 601static void 602cxgb_toe_detach(struct tcpcb *tp) 603{ 604 struct toepcb *toep; 605 /* 606 * XXX how do we handle teardown in the SYN_SENT state? 
607 * 608 */ 609 INP_INFO_WLOCK(&tcbinfo); 610 toep = tp->t_toe; 611 toep->tp_tp = NULL; 612 613 /* 614 * unhook from socket 615 */ 616 tp->t_flags &= ~TF_TOE; 617 tp->t_toe = NULL; 618 INP_INFO_WUNLOCK(&tcbinfo); 619} 620 621 622static struct toe_usrreqs cxgb_toe_usrreqs = { 623 .tu_disconnect = cxgb_toe_disconnect, 624 .tu_reset = cxgb_toe_reset, 625 .tu_send = cxgb_toe_send, 626 .tu_rcvd = cxgb_toe_rcvd, 627 .tu_detach = cxgb_toe_detach, 628 .tu_detach = cxgb_toe_detach, 629 .tu_syncache_event = handle_syncache_event, 630}; 631 632 633static void 634__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word, 635 uint64_t mask, uint64_t val, int no_reply) 636{ 637 struct cpl_set_tcb_field *req; 638 struct tcpcb *tp = sototcpcb(so); 639 struct toepcb *toep = tp->t_toe; 640 641 req = mtod(m, struct cpl_set_tcb_field *); 642 m->m_pkthdr.len = m->m_len = sizeof(*req); 643 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 644 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 645 req->reply = V_NO_REPLY(no_reply); 646 req->cpu_idx = 0; 647 req->word = htons(word); 648 req->mask = htobe64(mask); 649 req->val = htobe64(val); 650 651 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); 652 send_or_defer(so, tp, m, 0); 653} 654 655static void 656t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val) 657{ 658 struct mbuf *m; 659 struct tcpcb *tp = sototcpcb(so); 660 struct toepcb *toep = tp->t_toe; 661 662 if (toep == NULL) 663 return; 664 665 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) 666 return; 667 668 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 669 670 __set_tcb_field(so, m, word, mask, val, 1); 671} 672 673/* 674 * Set one of the t_flags bits in the TCB. 
675 */ 676static void 677set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val) 678{ 679 t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 680} 681 682/* 683 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 684 */ 685static void 686t3_set_nagle(struct socket *so) 687{ 688 struct tcpcb *tp = sototcpcb(so); 689 690 set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 691} 692 693/* 694 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 695 */ 696void 697t3_set_keepalive(struct socket *so, int on_off) 698{ 699 set_tcb_tflag(so, S_TF_KEEPALIVE, on_off); 700} 701 702void 703t3_set_rcv_coalesce_enable(struct socket *so, int on_off) 704{ 705 set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off); 706} 707 708/* 709 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 710 */ 711static void 712t3_set_tos(struct socket *so) 713{ 714 t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 715 V_TCB_TOS(SO_TOS(so))); 716} 717 718 719/* 720 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 721 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 722 * set the PSH bit in the last segment, which would trigger delivery.] 723 * We work around the issue by setting a DDP buffer in a partial placed state, 724 * which guarantees that TP will schedule a timer. 
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

/*
 * Enable or disable DDP on a connection.  Disabling also applies the
 * partial-placement timer workaround described above.
 */
static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on)
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);

}

/*
 * Program the tag/color of DDP buffer @buf_idx in the TCB.
 * NOTE(review): the value is passed raw while the mask is built with
 * V_TCB_RX_DDP_BUF0_TAG() — presumably tag_color is already in field
 * position; confirm against the TCB layout.
 */
void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

/*
 * Program offset and length of DDP buffer 0 or 1 in the TCB.
 * NOTE(review): for buffer 1 the mask shifts M_TCB_RX_DDP_BUF1_LEN by 32
 * inside the V_ macro; if that macro's argument is a plain int this is an
 * undefined 32-bit shift — verify against cxgb_tcb.h before relying on it.
 */
void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
	       unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
				 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
				 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
				 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
				 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
				 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
				 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
				 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
				 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

/*
 * Select a congestion-control algorithm by name.  Currently a stub that
 * always reports success.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef notyet
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

/*
 * Request a copy of the connection's TCB via a CPL_GET_TCB message.
 * Deferred (queued) while the connection is still in SYN_SENT.
 * Returns 0 or ENOMEM if no mbuf could be allocated.
 */
int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	INP_LOCK_ASSERT(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return 0;
}

/*
 * Register the toepcb in the TID table; takes an extra reference on the
 * toepcb which is dropped when the TID is removed.
 */
static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);
	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 * find_best_mtu - find the entry in the MTU table closest to an MTU
 * @d: TOM state
 * @mtu: the target MTU
 *
 * Returns the index of the value in the MTU table that is closest to but
 * does not exceed the target MTU.
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Drop any work requests that never completed. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	printf("setting toep->tp_tp to NULL\n");

	toep->tp_tp = NULL;
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}

	/* In SYN_SENT only an ATID was allocated; otherwise a real TID. */
	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {					// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Switch the socket and tcpcb over to the offload usrreq vector.
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
	return wscale;
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned int
select_rcv_wnd(struct socket *so)
{
	struct toedev *dev = TOE_DEV(so);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = sbspace(&so->so_rcv);

	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accomodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ?
	    (uint32_t)d->rx_page_size * 23 :
	    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);

	printf("initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_snd.sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(so);
	/* DDP only if enabled, allowed on the socket, and window is large enough. */
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;

	/* RCV_BUFSIZ is in 1KB units, capped at the field maximum. */
	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	    V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}

/*
 * Calculate the option 2 value, carrying the congestion-control flavor
 * when one is configured.
 */
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
}
#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Populate a CPL_ACT_OPEN_REQ for an active open from the socket's
 * inpcb addresses and the computed option words.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	DPRINTF("connect smt_idx=%d\n", e->smt_idx);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
	    V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down an offloaded connection after a failed active open: release the
 * HW/offload resources and drop the tcpcb (if it still exists) with the
 * given error.
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		cxgb_tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	INP_INFO_WLOCK(&tcbinfo);
	/* The tcpcb may already be gone; nothing to tear down then. */
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	INP_LOCK(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
		    jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	INP_UNLOCK(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);

	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	/* These three failures are reported before the HW assigns a TID. */
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	/* Queue the TID for release; T3A does this differently. */
	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	INP_LOCK(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	INP_UNLOCK(inp);
}
#endif
/*
 * Send an active open request.
1247 */ 1248int 1249t3_connect(struct toedev *tdev, struct socket *so, 1250 struct rtentry *rt, struct sockaddr *nam) 1251{ 1252 struct mbuf *m; 1253 struct l2t_entry *e; 1254 struct tom_data *d = TOM_DATA(tdev); 1255 struct inpcb *inp = sotoinpcb(so); 1256 struct tcpcb *tp = intotcpcb(inp); 1257 struct toepcb *toep; /* allocated by init_offload_socket */ 1258 1259 int atid; 1260 1261 toep = toepcb_alloc(); 1262 if (toep == NULL) 1263 goto out_err; 1264 1265 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1266 goto out_err; 1267 1268 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1269 if (!e) 1270 goto free_tid; 1271 1272 INP_LOCK_ASSERT(inp); 1273 m = m_gethdr(MT_DATA, M_WAITOK); 1274 1275#if 0 1276 m->m_toe.mt_toepcb = tp->t_toe; 1277 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1278#endif 1279 SOCK_LOCK(so); 1280 1281 init_offload_socket(so, tdev, atid, e, rt, toep); 1282 1283 install_offload_ops(so); 1284 1285 mk_act_open_req(so, m, atid, e); 1286 SOCK_UNLOCK(so); 1287 1288 soisconnecting(so); 1289 toep = tp->t_toe; 1290 m_set_toep(m, tp->t_toe); 1291 1292 printf("sending off request\n"); 1293 1294 toep->tp_state = TCPS_SYN_SENT; 1295 l2t_send(d->cdev, (struct mbuf *)m, e); 1296 1297 if (toep->tp_ulp_mode) 1298 t3_enable_ddp(so, 0); 1299 return (0); 1300 1301free_tid: 1302 printf("failing connect - free atid\n"); 1303 1304 free_atid(d->cdev, atid); 1305out_err: 1306 printf("return ENOMEM\n"); 1307 return (ENOMEM); 1308} 1309 1310/* 1311 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1312 * not send multiple ABORT_REQs for the same connection and also that we do 1313 * not try to send a message after the connection has closed. Returns 1 if 1314 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
 */
static void
t3_send_reset(struct toepcb *toep)
{

	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;

	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		so = toeptoso(toep);
	}

	/* Already shutting down, or the toedev is gone: nothing to send. */
	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);

	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	/* Before the SYN is answered there is no L2 path; defer the abort. */
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

/*
 * IP-level socket option handler for offloaded connections; only IP_TOS is
 * supported (IP_OPTIONS is explicitly rejected).
 */
static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;

	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);

	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	/* High-precedence TOS values are privileged. */
	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;

	/* Push the new TOS into the HW TCB. */
	t3_set_tos(so);

	return (0);
}

/*
 * TCP-level socket option handler for offloaded connections; supports
 * TCP_CONGESTION (selects a HW congestion-control flavor) and TCP_NODELAY.
 */
static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);

	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;

		if (optlen < 1)
			return (EINVAL);

		err = copyinstr(sopt->sopt_val, name,
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef notyet
			tp->t_cong_control = strdup(name, M_CXGB);
#endif
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);

		INP_LOCK(inp);

		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		INP_UNLOCK(inp);

		/*
		 * NOTE(review): t_flags is re-read after the unlock here;
		 * looks racy but is kept as-is to preserve behavior.
		 */
		if (oldval != tp->t_flags)
			t3_set_nagle(so);

	}

	return (0);
}

/*
 * Socket option entry point for offloaded sockets: try the TOE-specific
 * handlers first and fall back to the stock tcp_ctloutput for anything
 * they don't support.
 */
static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return tcp_ctloutput(so, sopt);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so = toeptoso(toep);
	int len = be16toh(hdr->len);

	INP_LOCK(tp->t_inpcb);

#ifdef notyet
	if (__predict_false(sk_no_receive(sk))) {
		handle_excess_rx(so, skb);
		return;
	}

	if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
		handle_ddp_data(so, skb);

	TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
	TCP_SKB_CB(skb)->flags = 0;
	skb_ulp_mode(skb) = 0;                    /* for iSCSI */
#endif
#if VALIDATE_SEQ
	if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
		printk(KERN_ERR
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		    TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
		       tp->rcv_nxt);
		__kfree_skb(skb);
		return;
	}
#endif
	/* Strip the CPL header; what remains is TCP payload. */
	m_adj(m, sizeof(*hdr));

#ifdef notyet
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif
	/* Track the HW's delayed-ACK mode changes. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);

	/* Trust the CPL-reported length over the mbuf's own accounting. */
	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
#ifdef T3_TRACE
	T3_TRACE2(TIDTB(sk),
	    "new_rx_data: seq 0x%x len %u",
	    TCP_SKB_CB(skb)->seq, skb->len);
#endif
	SOCKBUF_LOCK(&so->so_rcv);
	if (sb_notify(&so->so_rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);

	sbappend_locked(&so->so_rcv, m);
	KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,

	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));

	INP_UNLOCK(tp->t_inpcb);
	DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);

	/* sorwakeup_locked() drops the sockbuf lock for us. */
	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

/*
 * Process an RX_DATA_DDP message: data has been placed directly into a DDP
 * buffer by the HW; account for the consumed sequence space and queue a
 * zero-copy notification on the receive buffer.
 */
static void
new_rx_data_ddp(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(so, m);
		return;
	}
#endif
	tp = sototcpcb(so);
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	/* Which of the two ping-pong DDP buffers the HW placed into. */
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
		  "hdr seq 0x%x len %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
	T3_TRACE1(TIDTB(sk),
		  "new_rx_data_ddp: ddp_report 0x%x",
		  ddp_report);
#endif

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	/*
	 * Overload to store old rcv_next
	 */
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
#ifdef notyet
	TCP_SKB_CB(skb)->when = end_offset - skb->len;

	/*
	 * We store in mac.raw the address of the gather list where the
	 * placement happened.
	 */
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	bsp->cur_offset = end_offset;

	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
#if 0
		TCP_SKB_CB(skb)->flags = 0;  /* potential spurious completion */
#endif
		panic("spurious ddp completion");
	} else {
		m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_pkthdr.csum_flags |= DDP_BF_PSH;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/* All DDP error conditions reported in the ddpvld_status field. */
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		/* Tell the dispatcher the mbuf is still ours to recycle. */
		return CPL_RET_BUF_DONE;
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(so, m);
	return (0);
}

/*
 * Process an RX_DDP_COMPLETE message: a DDP buffer has been fully consumed;
 * account for the data placed since the last notification and queue a
 * zero-length completion notification on the receive buffer.
 */
static void
process_ddp_complete(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(sk, skb);
		return;
	}
#endif
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	/* m_len (overloaded) = bytes placed since the last notification. */
	when = bsp->cur_offset;
	m->m_len = G_DDP_OFFSET(ddp_report) - when;

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		  G_DDP_OFFSET(ddp_report), skb->len);
#endif

	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;                     /* flip buffers */

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		  G_DDP_OFFSET(ddp_report));
#endif
#if 0
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	/* Bit 0 set marks a completed DDP buffer (see new_rx_data_ddp). */
	m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	/* csum_data (overloaded) = rcv_nxt before this placement. */
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt += m->m_len;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(so, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	INP_LOCK_ASSERT(tp->t_inpcb);
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	tcp_twstart(tp);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int keep = 0, dead = (so->so_state & SS_NOFDREF);

	DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);

#ifdef T3_TRACE
	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif

	/* On non-T3A a pending abort supersedes the peer close. */
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}

#ifdef notyet
	if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, skb);
		if (keep < 0)
			return;
	}
	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(so, SOCK_DONE);
#endif
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
		socantrcvmore(so);
	/* Standard TCP state transitions on receipt of a FIN. */
	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
		tp->t_starttime = ticks;
		/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
		    TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	/* tcp_close() may have freed the tcpcb and returned NULL. */
	if (tp)
		INP_UNLOCK(tp->t_inpcb);

	if (!dead) {
		DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);

		sorwakeup(so);
		sowwakeup(so);
		wakeup(&so->so_timeo);
#ifdef notyet
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(so, 1, POLL_HUP);
		else
			sk_wake_async(so, 1, POLL_IN);
#endif
	}
out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	do_peer_fin(so, m);
	return (0);
}

/*
 * Process a CLOSE_CON_RPL: the HW has acknowledged our FIN; advance snd_una
 * past the acked data and run the local-close side of the TCP state machine.
 */
static void
process_close_con_rpl(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct toepcb *toep = tp->t_toe;

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
	    !!(so->so_state & SS_NOFDREF));
	/* On non-T3A a pending abort supersedes the close reply. */
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
		goto out;

	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);

		} else
			enter_timewait(so);
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		tp = tcp_close(tp);
		break;
	case TCPS_FIN_WAIT_1:
#ifdef notyet
		dst_confirm(sk->sk_dst_cache);
#endif
		soisdisconnecting(so);

		if ((so->so_state & SS_NOFDREF) == 0) {
			/*
			 * Wake up lingering close
			 */
			sowwakeup(so);
			sorwakeup(so);
			wakeup(&so->so_timeo);
		} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			tp = cxgb_tcp_drop(tp, 0);
		}

		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		    TOE_DEV(so)->tod_name, toep->tp_tid,
		       tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	/* tcp_close()/cxgb_tcp_drop() may have freed the tcpcb. */
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
out:
	m_free(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
			    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	process_close_con_rpl(so, m);
	return (0);
}

/*
 * Process abort replies.  We only process these messages if we anticipate
 * them as the coordination between SW and HW in this area is somewhat lacking
 * and sometimes we get ABORT_RPLs after we are done with the connection that
 * originated the ABORT_REQ.
1997 */ 1998static void 1999process_abort_rpl(struct socket *so, struct mbuf *m) 2000{ 2001 struct tcpcb *tp = sototcpcb(so); 2002 struct toepcb *toep = tp->t_toe; 2003 2004#ifdef T3_TRACE 2005 T3_TRACE1(TIDTB(sk), 2006 "process_abort_rpl: GTS rpl pending %d", 2007 sock_flag(sk, ABORT_RPL_PENDING)); 2008#endif 2009 INP_LOCK(tp->t_inpcb); 2010 2011 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2012 /* 2013 * XXX panic on tcpdrop 2014 */ 2015 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so))) 2016 toep->tp_flags |= TP_ABORT_RPL_RCVD; 2017 else { 2018 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); 2019 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || 2020 !is_t3a(TOE_DEV(so))) { 2021 if (toep->tp_flags & TP_ABORT_REQ_RCVD) 2022 panic("TP_ABORT_REQ_RCVD set"); 2023 INP_INFO_WLOCK(&tcbinfo); 2024 INP_LOCK(tp->t_inpcb); 2025 t3_release_offload_resources(toep); 2026 tp = tcp_close(tp); 2027 INP_INFO_WUNLOCK(&tcbinfo); 2028 } 2029 } 2030 } 2031 if (tp) 2032 INP_UNLOCK(tp->t_inpcb); 2033 2034 m_free(m); 2035} 2036 2037/* 2038 * Handle an ABORT_RPL_RSS CPL message. 2039 */ 2040static int 2041do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2042{ 2043 struct socket *so; 2044 struct cpl_abort_rpl_rss *rpl = cplhdr(m); 2045 struct toepcb *toep; 2046 2047 /* 2048 * Ignore replies to post-close aborts indicating that the abort was 2049 * requested too late. These connections are terminated when we get 2050 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 2051 * arrives the TID is either no longer used or it has been recycled. 
2052 */ 2053 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2054discard: 2055 m_free(m); 2056 return (0); 2057 } 2058 2059 toep = (struct toepcb *)ctx; 2060 2061 /* 2062 * Sometimes we've already closed the socket, e.g., a post-close 2063 * abort races with ABORT_REQ_RSS, the latter frees the socket 2064 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2065 * but FW turns the ABORT_REQ into a regular one and so we get 2066 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2067 */ 2068 if (!toep) 2069 goto discard; 2070 2071 if (toep->tp_tp == NULL) { 2072 printf("removing tid for abort\n"); 2073 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2074 if (toep->tp_l2t) 2075 l2t_release(L2DATA(cdev), toep->tp_l2t); 2076 2077 toepcb_release(toep); 2078 goto discard; 2079 } 2080 2081 printf("toep=%p\n", toep); 2082 printf("tp=%p\n", toep->tp_tp); 2083 2084 so = toeptoso(toep); /* <- XXX panic */ 2085 toepcb_hold(toep); 2086 process_abort_rpl(so, m); 2087 toepcb_release(toep); 2088 return (0); 2089} 2090 2091/* 2092 * Convert the status code of an ABORT_REQ into a Linux error code. Also 2093 * indicate whether RST should be sent in response. 2094 */ 2095static int 2096abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2097{ 2098 struct tcpcb *tp = sototcpcb(so); 2099 2100 switch (abort_reason) { 2101 case CPL_ERR_BAD_SYN: 2102#if 0 2103 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2104#endif 2105 case CPL_ERR_CONN_RESET: 2106 // XXX need to handle SYN_RECV due to crossed SYNs 2107 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2108 case CPL_ERR_XMIT_TIMEDOUT: 2109 case CPL_ERR_PERSIST_TIMEDOUT: 2110 case CPL_ERR_FINWAIT2_TIMEDOUT: 2111 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2112#if 0 2113 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2114#endif 2115 return (ETIMEDOUT); 2116 default: 2117 return (EIO); 2118 } 2119} 2120 2121static inline void 2122set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2123{ 2124 struct cpl_abort_rpl *rpl = cplhdr(m); 2125 2126 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2127 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2128 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2129 2130 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2131 rpl->cmd = cmd; 2132} 2133 2134static void 2135send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2136{ 2137 struct mbuf *reply_mbuf; 2138 struct cpl_abort_req_rss *req = cplhdr(m); 2139 2140 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2141 m_set_priority(m, CPL_PRIORITY_DATA); 2142 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2143 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2144 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2145 m_free(m); 2146} 2147 2148/* 2149 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2150 */ 2151static inline int 2152is_neg_adv_abort(unsigned int status) 2153{ 2154 return status == CPL_ERR_RTX_NEG_ADVICE || 2155 status == CPL_ERR_PERSIST_NEG_ADVICE; 2156} 2157 2158static void 2159send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2160{ 2161 struct mbuf *reply_mbuf; 2162 struct cpl_abort_req_rss *req = cplhdr(m); 2163 2164 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2165 2166 if (!reply_mbuf) { 2167 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2168 req->status = rst_status; 2169 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2170 return; 2171 } 2172 2173 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2174 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2175 m_free(m); 2176 2177 /* 2178 * XXX need to sync with ARP as for SYN_RECV connections we can send 2179 * these messages while ARP is pending. For other connection states 2180 * it's not a problem. 2181 */ 2182 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2183} 2184 2185#ifdef notyet 2186static void 2187cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2188{ 2189 UNIMPLEMENTED(); 2190#ifdef notyet 2191 struct request_sock *req = child->sk_user_data; 2192 2193 inet_csk_reqsk_queue_removed(parent, req); 2194 synq_remove(tcp_sk(child)); 2195 __reqsk_free(req); 2196 child->sk_user_data = NULL; 2197#endif 2198} 2199 2200 2201/* 2202 * Performs the actual work to abort a SYN_RECV connection. 2203 */ 2204static void 2205do_abort_syn_rcv(struct socket *child, struct socket *parent) 2206{ 2207 struct tcpcb *parenttp = sototcpcb(parent); 2208 struct tcpcb *childtp = sototcpcb(child); 2209 2210 /* 2211 * If the server is still open we clean up the child connection, 2212 * otherwise the server already did the clean up as it was purging 2213 * its SYN queue and the skb was just sitting in its backlog. 2214 */ 2215 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2216 cleanup_syn_rcv_conn(child, parent); 2217 INP_INFO_WLOCK(&tcbinfo); 2218 INP_LOCK(childtp->t_inpcb); 2219 t3_release_offload_resources(childtp->t_toe); 2220 childtp = tcp_close(childtp); 2221 INP_INFO_WUNLOCK(&tcbinfo); 2222 if (childtp) 2223 INP_UNLOCK(childtp->t_inpcb); 2224 } 2225} 2226#endif 2227 2228/* 2229 * Handle abort requests for a SYN_RECV connection. These need extra work 2230 * because the socket is on its parent's SYN queue. 
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
	/* Not yet ported from the Linux driver; body below is reference code. */
	UNIMPLEMENTED();
#ifdef notyet
	struct socket *parent;
	struct toedev *tdev = TOE_DEV(so);
	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
	struct socket *oreq = so->so_incomp;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;

	if (!oreq)
		return -1;	/* somehow we are not on the SYN queue */

	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, oreq->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	SOCK_LOCK(parent);
	do_abort_syn_rcv(so, parent);
	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
	SOCK_UNLOCK(parent);
#endif
	return (0);
}

/*
 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
 * request except that we need to reply to it.
 *
 * Called with the inpcb info lock held (see do_abort_req); acquires and
 * releases the per-connection inpcb lock itself.  Consumes m on the paths
 * that do not forward it to send_abort_rpl().
 */
static void
process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
{
	int rst_status = CPL_ABORT_NO_RST;
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	INP_LOCK(tp->t_inpcb);
	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
		/*
		 * First abort request seen for this connection: just record
		 * it and wait (presumably for the second copy T3A hardware
		 * can generate — TODO confirm against do_abort_req, which
		 * sets TP_ABORT_REQ_RCVD for SYN_RCVD connections).
		 */
		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
		m_free(m);
		goto skip;
	}

	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
	/*
	 * Three cases to consider:
	 * a) We haven't sent an abort_req; close the connection.
	 * b) We have sent a post-close abort_req that will get to TP too late
	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
	 *    be ignored and the connection should be closed now.
	 * c) We have sent a regular abort_req that will get to TP too late.
	 *    That will generate an abort_rpl with status 0, wait for it.
	 */
	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
		so->so_error = abort_status_to_errno(so, req->status,
		    &rst_status);
#if 0
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
#endif
		/*
		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 it has taken care of the abort.
		 */
		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
			goto skip;

		t3_release_offload_resources(toep);
		/* tcp_close() may free the tcpcb and return NULL. */
		tp = tcp_close(tp);
	}
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
	send_abort_rpl(m, tdev, rst_status);
	return;

skip:
	INP_UNLOCK(tp->t_inpcb);
}

/*
 * Handle an ABORT_REQ_RSS CPL message.
 *
 * Top-level CPL dispatch entry (registered in t3_init_cpl_io).  ctx is the
 * toepcb installed for the TID.  Always returns 0; the mbuf is consumed on
 * every path.
 */
static int
do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so;
	struct inpcb *inp;

	/* Negative advice aborts are informational only; drop them. */
	if (is_neg_adv_abort(req->status)) {
		m_free(m);
		return (0);
	}

	/* XXX debug chatter — printf()s below look like development leftovers. */
	printf("aborting tid=%d\n", toep->tp_tid);

	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
		/*
		 * Abort for an embryonic (SYN_RCVD) connection that has no
		 * socket yet: tear the TID and L2T entry down directly.
		 */
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		toep->tp_flags |= TP_ABORT_REQ_RCVD;
		printf("sending abort rpl\n");

		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
		printf("sent\n");
		if (toep->tp_l2t)
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		/*
		 * Unhook
		 */
		toep->tp_tp->t_toe = NULL;
		toep->tp_tp->t_flags &= ~TF_TOE;
		toep->tp_tp = NULL;
		/*
		 * XXX need to call syncache_chkrst - but we don't
		 * have a way of doing that yet
		 */
		toepcb_release(toep);
		printf("abort for unestablished connection :-(\n");
		return (0);
	}
	if (toep->tp_tp == NULL) {
		printf("disconnected toepcb\n");
		/* should be freed momentarily */
		return (0);
	}

	so = toeptoso(toep);
	inp = sotoinpcb(so);

	VALIDATE_SOCK(so);
	/* Hold a toepcb reference across processing so it cannot vanish. */
	toepcb_hold(toep);
	INP_INFO_WLOCK(&tcbinfo);
	process_abort_req(so, m, TOE_DEV(so));
	INP_INFO_WUNLOCK(&tcbinfo);
	toepcb_release(toep);
	return (0);
}
#ifdef notyet
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
	struct toedev *tdev = TOE_DEV(parent);

	do_abort_syn_rcv(child, parent);
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
		struct cpl_pass_accept_rpl *rpl = cplhdr(m);

		rpl->opt0h = htonl(F_TCAM_BYPASS);
		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	} else
		m_free(m);
}
#endif
/*
 * ARP-failure handler for a CPL_PASS_ACCEPT_RPL that accepted a connection.
 * Not yet ported; the Linux reference implementation is kept under notyet.
 */
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
	UNIMPLEMENTED();

#ifdef notyet
	struct t3cdev *cdev;
	struct socket *parent;
	struct socket *oreq;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;
	struct tcpcb *otp, *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/*
	 * If the connection is being aborted due to the parent listening
	 * socket going away there's nothing to do, the ABORT_REQ will close
	 * the connection.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		m_free(m);
		return;
	}

	oreq = so->so_incomp;
	otp = sototcpcb(oreq);

	cdev = T3C_DEV(so);
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, otp->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	SOCK_LOCK(parent);
	pass_open_abort(so, parent, m);
	SOCK_UNLOCK(parent);
#endif
}

/*
 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
 * connection.
2433 */ 2434static void 2435pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) 2436{ 2437 2438#ifdef notyet 2439 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 2440 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); 2441#endif 2442 handle_pass_open_arp_failure(m_get_socket(m), m); 2443} 2444 2445/* 2446 * Populate a reject CPL_PASS_ACCEPT_RPL WR. 2447 */ 2448static void 2449mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) 2450{ 2451 struct cpl_pass_accept_req *req = cplhdr(req_mbuf); 2452 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); 2453 unsigned int tid = GET_TID(req); 2454 2455 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); 2456 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 2457 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 2458 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet 2459 rpl->opt0h = htonl(F_TCAM_BYPASS); 2460 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 2461 rpl->opt2 = 0; 2462 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 2463} 2464 2465/* 2466 * Send a deferred reject to an accept request. 
2467 */ 2468static void 2469reject_pass_request(struct toedev *tdev, struct mbuf *m) 2470{ 2471 struct mbuf *reply_mbuf; 2472 2473 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 2474 mk_pass_accept_rpl(reply_mbuf, m); 2475 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2476 m_free(m); 2477} 2478 2479static void 2480handle_syncache_event(int event, void *arg) 2481{ 2482 struct toepcb *toep = arg; 2483 2484 switch (event) { 2485 case TOE_SC_ENTRY_PRESENT: 2486 /* 2487 * entry already exists - free toepcb 2488 * and l2t 2489 */ 2490 printf("syncache entry present\n"); 2491 toepcb_release(toep); 2492 break; 2493 case TOE_SC_DROP: 2494 /* 2495 * The syncache has given up on this entry 2496 * either it timed out, or it was evicted 2497 * we need to explicitly release the tid 2498 */ 2499 printf("syncache entry dropped\n"); 2500 toepcb_release(toep); 2501 break; 2502 default: 2503 log(LOG_ERR, "unknown syncache event %d\n", event); 2504 break; 2505 } 2506} 2507 2508static void 2509syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 2510{ 2511 struct in_conninfo inc; 2512 struct tcpopt to; 2513 struct tcphdr th; 2514 struct inpcb *inp; 2515 int mss, wsf, sack, ts; 2516 2517 bzero(&to, sizeof(struct tcpopt)); 2518 inp = sotoinpcb(lso); 2519 2520 /* 2521 * Fill out information for entering us into the syncache 2522 */ 2523 inc.inc_fport = th.th_sport = req->peer_port; 2524 inc.inc_lport = th.th_dport = req->local_port; 2525 toep->tp_iss = th.th_seq = req->rcv_isn; 2526 th.th_flags = TH_SYN; 2527 2528 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn); 2529 2530 inc.inc_isipv6 = 0; 2531 inc.inc_len = 0; 2532 inc.inc_faddr.s_addr = req->peer_ip; 2533 inc.inc_laddr.s_addr = req->local_ip; 2534 2535 DPRINTF("syncache add of %d:%d %d:%d\n", 2536 ntohl(req->local_ip), ntohs(req->local_port), 2537 ntohl(req->peer_ip), ntohs(req->peer_port)); 2538 2539 mss = req->tcp_options.mss; 2540 
wsf = req->tcp_options.wsf; 2541 ts = req->tcp_options.tstamp; 2542 sack = req->tcp_options.sack; 2543 to.to_mss = mss; 2544 to.to_wscale = wsf; 2545 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 2546 2547 INP_INFO_WLOCK(&tcbinfo); 2548 INP_LOCK(inp); 2549 syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); 2550} 2551 2552 2553/* 2554 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket 2555 * lock held. Note that the sock here is a listening socket that is not owned 2556 * by the TOE. 2557 */ 2558static void 2559process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, 2560 struct listen_ctx *lctx) 2561{ 2562 int rt_flags; 2563 struct l2t_entry *e; 2564 struct iff_mac tim; 2565 struct mbuf *reply_mbuf, *ddp_mbuf = NULL; 2566 struct cpl_pass_accept_rpl *rpl; 2567 struct cpl_pass_accept_req *req = cplhdr(m); 2568 unsigned int tid = GET_TID(req); 2569 struct tom_data *d = TOM_DATA(tdev); 2570 struct t3cdev *cdev = d->cdev; 2571 struct tcpcb *tp = sototcpcb(so); 2572 struct toepcb *newtoep; 2573 struct rtentry *dst; 2574 struct sockaddr_in nam; 2575 struct t3c_data *td = T3C_DATA(cdev); 2576 2577 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2578 if (__predict_false(reply_mbuf == NULL)) { 2579 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 2580 t3_defer_reply(m, tdev, reject_pass_request); 2581 else { 2582 cxgb_queue_tid_release(cdev, tid); 2583 m_free(m); 2584 } 2585 DPRINTF("failed to get reply_mbuf\n"); 2586 2587 goto out; 2588 } 2589 2590 if (tp->t_state != TCPS_LISTEN) { 2591 DPRINTF("socket not in listen state\n"); 2592 2593 goto reject; 2594 } 2595 2596 tim.mac_addr = req->dst_mac; 2597 tim.vlan_tag = ntohs(req->vlan_tag); 2598 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { 2599 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); 2600 goto reject; 2601 } 2602 2603#ifdef notyet 2604 /* 2605 * XXX do route lookup to confirm that 
we're still listening on this 2606 * address 2607 */ 2608 if (ip_route_input(skb, req->local_ip, req->peer_ip, 2609 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) 2610 goto reject; 2611 rt_flags = ((struct rtable *)skb->dst)->rt_flags & 2612 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); 2613 dst_release(skb->dst); // done with the input route, release it 2614 skb->dst = NULL; 2615 2616 if ((rt_flags & RTF_LOCAL) == 0) 2617 goto reject; 2618#endif 2619 /* 2620 * XXX 2621 */ 2622 rt_flags = RTF_LOCAL; 2623 if ((rt_flags & RTF_LOCAL) == 0) 2624 goto reject; 2625 2626 /* 2627 * Calculate values and add to syncache 2628 */ 2629 2630 newtoep = toepcb_alloc(); 2631 if (newtoep == NULL) 2632 goto reject; 2633 2634 bzero(&nam, sizeof(struct sockaddr_in)); 2635 2636 nam.sin_len = sizeof(struct sockaddr_in); 2637 nam.sin_family = AF_INET; 2638 nam.sin_addr.s_addr =req->peer_ip; 2639 dst = rtalloc2((struct sockaddr *)&nam, 1, 0); 2640 2641 if (dst == NULL) { 2642 printf("failed to find route\n"); 2643 goto reject; 2644 } 2645 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, 2646 (struct sockaddr *)&nam); 2647 if (e == NULL) { 2648 DPRINTF("failed to get l2t\n"); 2649 } 2650 /* 2651 * Point to our listen socket until accept 2652 */ 2653 newtoep->tp_tp = tp; 2654 newtoep->tp_flags = TP_SYN_RCVD; 2655 newtoep->tp_tid = tid; 2656 newtoep->tp_toedev = tdev; 2657 2658 printf("inserting tid=%d\n", tid); 2659 cxgb_insert_tid(cdev, d->client, newtoep, tid); 2660 SOCK_LOCK(so); 2661 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); 2662 SOCK_UNLOCK(so); 2663 2664 2665 if (lctx->ulp_mode) { 2666 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2667 2668 if (!ddp_mbuf) 2669 newtoep->tp_ulp_mode = 0; 2670 else 2671 newtoep->tp_ulp_mode = lctx->ulp_mode; 2672 } 2673 2674 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); 2675 2676 DPRINTF("adding request to syn cache\n"); 2677 2678 /* 2679 * XXX workaround for lack of syncache drop 2680 */ 2681 toepcb_hold(newtoep); 
2682 syncache_add_accept_req(req, so, newtoep); 2683 2684 2685 2686 rpl = cplhdr(reply_mbuf); 2687 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); 2688 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 2689 rpl->wr.wr_lo = 0; 2690 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 2691 rpl->opt2 = htonl(calc_opt2(so, tdev)); 2692 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 2693 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten 2694 2695 DPRINTF("accept smt_idx=%d\n", e->smt_idx); 2696 2697 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | 2698 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); 2699 rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) | 2700 CPL_PASS_OPEN_ACCEPT); 2701 2702 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); 2703 2704 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so)); 2705 2706#ifdef DEBUG_PRINT 2707 { 2708 int i; 2709 2710 DPRINTF("rpl:\n"); 2711 uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *); 2712 2713 for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++) 2714 DPRINTF("[%d] %08x\n", i, rplbuf[i]); 2715 } 2716#endif 2717 2718 2719 l2t_send(cdev, reply_mbuf, e); 2720 m_free(m); 2721#ifdef notyet 2722 /* 2723 * XXX this call path has to be converted to not depend on sockets 2724 */ 2725 if (newtoep->tp_ulp_mode) 2726 __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS, 2727 V_TF_DDP_OFF(1) | 2728 TP_DDP_TIMER_WORKAROUND_MASK, 2729 V_TF_DDP_OFF(1) | 2730 TP_DDP_TIMER_WORKAROUND_VAL, 1); 2731 2732#endif 2733 return; 2734reject: 2735 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 2736 mk_pass_accept_rpl(reply_mbuf, m); 2737 else 2738 mk_tid_release(reply_mbuf, NULL, tid); 2739 cxgb_ofld_send(cdev, reply_mbuf); 2740 m_free(m); 2741out: 2742#if 0 2743 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 2744#else 2745 return; 2746#endif 2747} 2748 2749/* 2750 * Handle a CPL_PASS_ACCEPT_REQ message. 
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	/* CPL dispatch entry for passive opens; ctx is the listen context. */
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso;
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	/* NOTE(review): this validation block is unported Linux code
	 * (printk/unlikely/lsk) and will not compile if enabled. */
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to Linux's native format.
 */
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	INP_LOCK_ASSERT(tp->t_inpcb);

	/* MSS clamp = negotiated MTU minus 40 (IP + TCP header). */
	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
	tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
	tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
	/*
	 * NOTE(review): zeroing rcv_scale when scaling WAS negotiated looks
	 * inverted — one would expect the received scale factor here.
	 * Verify against the hardware's G_TCPOPT_* semantics before changing.
	 */
	if (tp->t_flags & TF_RCVD_SCALE)
		tp->rcv_scale = 0;
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCP_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 */
static void
make_established(struct socket *so, u32 snd_isn, unsigned int opt)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/* Seed all send-side sequence state from the hardware-supplied ISN. */
	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
	assign_rxopt(so, opt);
	/* Route socket-option processing through the TOE from now on. */
	so->so_proto->pr_ctloutput = t3_ctloutput;

#if 0
	inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif

	/*
	 * XXX not clear what rcv_wup maps to
	 */
	/*
	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
	 * pass through opt0.
	 */
	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);

	dump_toepcb(toep);

#ifdef notyet
/*
 * no clean interface for marking ARP up to date
 */
	dst_confirm(sk->sk_dst_cache);
#endif
	tp->t_state = TCPS_ESTABLISHED;
}

/*
 * Expand the syncache entry for an offloaded passive open using the fields
 * of the CPL_PASS_ESTABLISH message.  Returns the syncache_expand() result;
 * on success *so is replaced with the newly created connection socket.
 */
static int
syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
{

	struct in_conninfo inc;
	struct tcpopt to;
	struct tcphdr th;
	int mss, wsf, sack, ts;
	struct mbuf *m = NULL;
	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
	unsigned int opt;

#ifdef MAC
#error "no MAC support"
#endif

	opt = ntohs(req->tcp_opt);

	bzero(&to, sizeof(struct tcpopt));

	/*
	 * Fill out information for entering us into the syncache.
	 * NOTE(review): th and inc are only partially initialized here —
	 * fields not assigned below carry stack garbage; verify syncache_expand
	 * does not read them.
	 */
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	th.th_seq = req->rcv_isn;
	th.th_flags = TH_ACK;

	inc.inc_isipv6 = 0;
	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;

	mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	wsf = G_TCPOPT_WSCALE_OK(opt);
	ts = G_TCPOPT_TSTAMP(opt);
	sack = G_TCPOPT_SACK(opt);

	to.to_mss = mss;
	to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);

	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port),
	    mss, wsf, ts, sack);
	return syncache_expand(&inc, &to, &th, so, m);
}


/*
 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;

	so = lso = toeptoso(toep);
	tdev = toep->tp_toedev;

	/* Take ourselves off the listener's SYN queue. */
	SOCK_LOCK(so);
	LIST_REMOVE(toep, synq_entry);
	SOCK_UNLOCK(so);

	INP_INFO_WLOCK(&tcbinfo);
	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		UNIMPLEMENTED();
	}

	/*
	 * XXX workaround for lack of syncache drop: drop the reference taken
	 * in process_pass_accept_req().  NOTE(review): toep is still used
	 * below — this relies on another reference keeping it alive; verify.
	 */
	toepcb_release(toep);

	tp = sototcpcb(so);
	INP_LOCK(tp->t_inpcb);
#ifdef notyet
	so->so_snd.sb_flags |= SB_TOE;
	so->so_rcv.sb_flags |= SB_TOE;
#endif
	/* Rebind the toepcb from the listener to the new connection. */
	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(so);
	DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
	install_offload_ops(so);

	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	/* RSS queue number is carried in the mbuf's csum_data by the driver. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	/* Enable DDP only if tuned on, not disabled per-socket, and the
	 * receive window is large enough to be worthwhile. */
	toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
	INP_INFO_WUNLOCK(&tcbinfo);
	INP_UNLOCK(tp->t_inpcb);
	soisconnected(so);

#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct socket *so)
{
	struct mbuf *m;
	struct toedev *tdev = TOE_DEV(so);
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	unsigned int tid = toep->tp_tid;

	/* XXX debug leftover */
	printf("fixup_and_send_ofo\n");

	INP_LOCK_ASSERT(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		/* Patch in the TID that was unknown when the WR was queued. */
		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.  Consumes m.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	/*
	 * NOTE(review): this also assigns rcv_isn to tp->rcv_wnd, which
	 * looks like a typo — a receive *window* set to a sequence number.
	 * Verify intent before changing; left as-is here.
	 */
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(so);

	if (__predict_false(so->so_state & SS_NOFDREF)) {
#ifdef notyet
		/*
		 * XXX not clear what should be done here
		 * appears to correspond to sorwakeup_locked
		 */
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
#endif
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	soisconnected(so);
	toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
	tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	/* The ATID we used for the active open rides in the tos_tid field. */
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	if (tp == NULL) {
		/* Connection already gone; just return the ATID.
		 * NOTE(review): m does not appear to be freed on this path —
		 * possible mbuf leak, confirm against handler conventions. */
		free_atid(cdev, atid);
		return (0);
	}

	so = toeptoso(toep);
	tdev = TOE_DEV(so);	/* blow up here if link was down */
	d = TOM_DATA(tdev);

	INP_LOCK(tp->t_inpcb);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	/* Swap the temporary ATID for the real TID now that we have one. */
	so_insert_tid(d, so, tid);
	free_atid(cdev, atid);
	/* RSS queue number is carried in the mbuf's csum_data by the driver. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	INP_UNLOCK(tp->t_inpcb);
	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3148 */ 3149static void 3150wr_ack(struct toepcb *toep, struct mbuf *m) 3151{ 3152 struct tcpcb *tp = toep->tp_tp; 3153 struct cpl_wr_ack *hdr = cplhdr(m); 3154 struct socket *so = toeptoso(toep); 3155 unsigned int credits = ntohs(hdr->credits); 3156 u32 snd_una = ntohl(hdr->snd_una); 3157 int bytes = 0; 3158 3159 DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits); 3160 3161 INP_LOCK(tp->t_inpcb); 3162 3163 toep->tp_wr_avail += credits; 3164 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3165 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3166 3167 while (credits) { 3168 struct mbuf *p = peek_wr(toep); 3169 DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ; 3170 3171 if (__predict_false(!p)) { 3172 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3173 "nothing pending, state %u\n", 3174 credits, toep->tp_tid, tp->t_state); 3175 break; 3176 } 3177 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3178#if DEBUG_WR > 1 3179 struct tx_data_wr *w = cplhdr(p); 3180#ifdef notyet 3181 log(LOG_ERR, 3182 "TID %u got %u WR credits, need %u, len %u, " 3183 "main body %u, frags %u, seq # %u, ACK una %u," 3184 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3185 toep->tp_tid, credits, p->csum, p->len, 3186 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3187 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3188 WR_AVAIL(tp), count_pending_wrs(tp) - credits); 3189#endif 3190#endif 3191 p->m_pkthdr.csum_data -= credits; 3192 break; 3193 } else { 3194 dequeue_wr(toep); 3195 credits -= p->m_pkthdr.csum_data; 3196 bytes += p->m_pkthdr.len; 3197 DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len); 3198 3199 m_free(p); 3200 } 3201 } 3202 3203#if DEBUG_WR 3204 check_wr_invariants(tp); 3205#endif 3206 3207 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3208#if VALIDATE_SEQ 3209 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3210 3211 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3212 "for TID %u, snd_una 
%u\n", (&d->tdev)->name, snd_una, 3213 toep->tp_tid, tp->snd_una); 3214#endif 3215 goto out_free; 3216 } 3217 3218 if (tp->snd_una != snd_una) { 3219 tp->snd_una = snd_una; 3220 tp->ts_recent_age = ticks; 3221#ifdef notyet 3222 /* 3223 * Keep ARP entry "minty fresh" 3224 */ 3225 dst_confirm(sk->sk_dst_cache); 3226#endif 3227 if (tp->snd_una == tp->snd_nxt) 3228 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3229 } 3230 if (bytes) { 3231 DPRINTF("sbdrop(%d)\n", bytes); 3232 SOCKBUF_LOCK(&so->so_snd); 3233 sbdrop_locked(&so->so_snd, bytes); 3234 sowwakeup_locked(so); 3235 } 3236 3237 if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc) 3238 t3_push_frames(so, 0); 3239 3240out_free: 3241 INP_UNLOCK(tp->t_inpcb); 3242 m_free(m); 3243} 3244 3245/* 3246 * Handler for TX_DATA_ACK CPL messages. 3247 */ 3248static int 3249do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 3250{ 3251 struct toepcb *toep = (struct toepcb *)ctx; 3252 3253 DPRINTF("do_wr_ack\n"); 3254 dump_toepcb(toep); 3255 3256 VALIDATE_SOCK(so); 3257 3258 wr_ack(toep, m); 3259 return 0; 3260} 3261 3262 3263/* 3264 * Reset a connection that is on a listener's SYN queue or accept queue, 3265 * i.e., one that has not had a struct socket associated with it. 3266 * Must be called from process context. 3267 * 3268 * Modeled after code in inet_csk_listen_stop(). 3269 */ 3270static void 3271t3_reset_listen_child(struct socket *child) 3272{ 3273 struct tcpcb *tp = sototcpcb(child); 3274 3275 t3_send_reset(tp->t_toe); 3276} 3277 3278/* 3279 * Disconnect offloaded established but not yet accepted connections sitting 3280 * on a server's accept_queue. We just send an ABORT_REQ at this point and 3281 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 
 */
void
t3_disconnect_acceptq(struct socket *listen_so)
{
	struct socket *so;
	struct tcpcb *tp;

	/*
	 * NOTE(review): so_comp is walked without an explicit accept-queue
	 * lock here — presumably the caller serializes; confirm.
	 */
	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
		tp = sototcpcb(so);

		if (tp->t_flags & TF_TOE) {
			INP_LOCK(tp->t_inpcb);
			t3_reset_listen_child(so);
			INP_UNLOCK(tp->t_inpcb);
		}

	}
}

/*
 * Reset offloaded connections sitting on a server's syn queue.  As above
 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */

void
t3_reset_synq(struct listen_ctx *lctx)
{
	struct toepcb *toep;

	SOCK_LOCK(lctx->lso);
	while (!LIST_EMPTY(&lctx->synq_head)) {
		toep = LIST_FIRST(&lctx->synq_head);
		LIST_REMOVE(toep, synq_entry);
		/* Detach from the (listener's) tcpcb before resetting. */
		toep->tp_tp = NULL;
		t3_send_reset(toep);
		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
		toepcb_release(toep);
	}
	SOCK_UNLOCK(lctx->lso);
}

/*
 * Precompute, for each possible mbuf-fragment count i, how many work
 * requests are needed to carry the corresponding scatter-gather list,
 * given the firmware's WR length (in 16-byte units).  Idempotent.
 */
void
t3_init_wr_tab(unsigned int wr_len)
{
	int i;

	if (mbuf_wrs[1])	/* already initialized */
		return;

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		/* SGL flits for i fragments: 3 flits per 2 entries. */
		int sgl_len = (3 * i) / 2 + (i & 1);

		sgl_len += 3;	/* WR header overhead */
		mbuf_wrs[i] = sgl_len <= wr_len ?
		    1 : 1 + (sgl_len - 2) / (wr_len - 1);
	}

	/* Cache the WR size in bytes (wr_len is in 8-byte flits). */
	wrlen = wr_len * 8;
}

/*
 * One-time module init: register this TOM's handlers for the CPL opcodes it
 * understands.  Returns 0 on success.
 */
int
t3_init_cpl_io(void)
{
#ifdef notyet
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	if (!tcphdr_skb) {
		log(LOG_ERR,
		    "Chelsio TCP offload: can't allocate sk_buff\n");
		return -1;
	}
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif


	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
#ifdef notyet
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
#endif
	return (0);
}