/* cxgb_cpl_io.c revision 174641 */
/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
27265555Sambrisko 28265555Sambrisko***************************************************************************/ 29265555Sambrisko 30265555Sambrisko#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 174641 2007-12-16 05:27:26Z kmacy $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/lock.h> 39#include <sys/mbuf.h> 40#include <sys/mutex.h> 41#include <sys/socket.h> 42#include <sys/sysctl.h> 43#include <sys/syslog.h> 44#include <sys/socketvar.h> 45#include <sys/protosw.h> 46#include <sys/priv.h> 47 48#include <net/if.h> 49#include <net/route.h> 50 51#include <netinet/in.h> 52#include <netinet/in_pcb.h> 53#include <netinet/in_systm.h> 54#include <netinet/in_var.h> 55 56 57#include <dev/cxgb/cxgb_osdep.h> 58#include <dev/cxgb/sys/mbufq.h> 59 60#include <netinet/ip.h> 61#include <netinet/tcp_var.h> 62#include <netinet/tcp_fsm.h> 63#include <netinet/tcp_ofld.h> 64#include <netinet/tcp_seq.h> 65#include <netinet/tcp_syncache.h> 66#include <net/route.h> 67 68 69#include <dev/cxgb/t3cdev.h> 70#include <dev/cxgb/common/cxgb_firmware_exports.h> 71#include <dev/cxgb/common/cxgb_t3_cpl.h> 72#include <dev/cxgb/common/cxgb_tcb.h> 73#include <dev/cxgb/common/cxgb_ctl_defs.h> 74#include <dev/cxgb/cxgb_l2t.h> 75#include <dev/cxgb/cxgb_offload.h> 76#include <vm/vm.h> 77#include <vm/pmap.h> 78#include <machine/bus.h> 79#include <dev/cxgb/sys/mvec.h> 80#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> 81#include <dev/cxgb/ulp/tom/cxgb_defs.h> 82#include <dev/cxgb/ulp/tom/cxgb_tom.h> 83#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> 84#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> 85 86 87 88/* 89 * For ULP connections HW may add headers, e.g., for digests, that aren't part 90 * of the messages sent by the host but that are part of the TCP payload and 91 * therefore consume TCP sequence space. 
Tx connection parameters that 92 * operate in TCP sequence space are affected by the HW additions and need to 93 * compensate for them to accurately track TCP sequence numbers. This array 94 * contains the compensating extra lengths for ULP packets. It is indexed by 95 * a packet's ULP submode. 96 */ 97const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; 98 99#ifdef notyet 100/* 101 * This sk_buff holds a fake header-only TCP segment that we use whenever we 102 * need to exploit SW TCP functionality that expects TCP headers, such as 103 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple 104 * CPUs without locking. 105 */ 106static struct mbuf *tcphdr_mbuf __read_mostly; 107#endif 108 109/* 110 * Size of WRs in bytes. Note that we assume all devices we are handling have 111 * the same WR size. 112 */ 113static unsigned int wrlen __read_mostly; 114 115/* 116 * The number of WRs needed for an skb depends on the number of page fragments 117 * in the skb and whether it has any payload in its main body. This maps the 118 * length of the gather list represented by an skb into the # of necessary WRs. 119 */ 120static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly; 121 122/* 123 * Max receive window supported by HW in bytes. Only a small part of it can 124 * be set through option0, the rest needs to be set through RX_DATA_ACK. 125 */ 126#define MAX_RCV_WND ((1U << 27) - 1) 127 128/* 129 * Min receive window. We want it to be large enough to accommodate receive 130 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
131 */ 132#define MIN_RCV_WND (24 * 1024U) 133#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS) 134 135#define VALIDATE_SEQ 0 136#define VALIDATE_SOCK(so) 137#define DEBUG_WR 0 138 139extern int tcp_do_autorcvbuf; 140extern int tcp_do_autosndbuf; 141extern int tcp_autorcvbuf_max; 142extern int tcp_autosndbuf_max; 143 144static void t3_send_reset(struct toepcb *toep); 145static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); 146static inline void free_atid(struct t3cdev *cdev, unsigned int tid); 147static void handle_syncache_event(int event, void *arg); 148 149 150static inline int 151is_t3a(const struct toedev *dev) 152{ 153 return (dev->tod_ttid == TOE_ID_CHELSIO_T3); 154} 155 156static void 157dump_toepcb(struct toepcb *toep) 158{ 159 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", 160 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, 161 toep->tp_mtu_idx, toep->tp_tid); 162 163 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", 164 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, 165 toep->tp_mss_clamp, toep->tp_flags); 166} 167 168static struct rtentry * 169rtalloc2(struct sockaddr *dst, int report, u_long ignflags) 170{ 171 struct rtentry *rt = NULL; 172 173 if ((rt = rtalloc1(dst, report, ignflags)) != NULL) 174 RT_UNLOCK(rt); 175 176 return (rt); 177} 178 179/* 180 * Determine whether to send a CPL message now or defer it. A message is 181 * deferred if the connection is in SYN_SENT since we don't know the TID yet. 182 * For connections in other states the message is sent immediately. 183 * If through_l2t is set the message is subject to ARP processing, otherwise 184 * it is sent directly. 
185 */ 186static inline void 187send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t) 188{ 189 struct toepcb *toep = tp->t_toe; 190 191 192 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { 193 INP_LOCK(tp->t_inpcb); 194 mbufq_tail(&toep->out_of_order_queue, m); // defer 195 INP_UNLOCK(tp->t_inpcb); 196 } else if (through_l2t) 197 l2t_send(T3C_DEV(so), m, toep->tp_l2t); // send through L2T 198 else 199 cxgb_ofld_send(T3C_DEV(so), m); // send directly 200} 201 202static inline unsigned int 203mkprio(unsigned int cntrl, const struct socket *so) 204{ 205 return cntrl; 206} 207 208/* 209 * Populate a TID_RELEASE WR. The skb must be already propely sized. 210 */ 211static inline void 212mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid) 213{ 214 struct cpl_tid_release *req; 215 216 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so)); 217 m->m_pkthdr.len = m->m_len = sizeof(*req); 218 req = mtod(m, struct cpl_tid_release *); 219 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 220 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); 221} 222 223static inline void 224make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) 225{ 226 struct tcpcb *tp = sototcpcb(so); 227 struct toepcb *toep = tp->t_toe; 228 struct tx_data_wr *req; 229 230 INP_LOCK_ASSERT(tp->t_inpcb); 231 232 req = mtod(m, struct tx_data_wr *); 233 m->m_len = sizeof(*req); 234 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 235 req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); 236 /* len includes the length of any HW ULP additions */ 237 req->len = htonl(len); 238 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); 239 /* V_TX_ULP_SUBMODE sets both the mode and submode */ 240 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | 241 V_TX_URG(/* skb_urgent(skb) */ 0 ) | 242 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && 243 (tail ? 
0 : 1)))); 244 req->sndseq = htonl(tp->snd_nxt); 245 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { 246 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 247 V_TX_CPU_IDX(toep->tp_qset)); 248 249 /* Sendbuffer is in units of 32KB. 250 */ 251 if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) 252 req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15)); 253 else 254 req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15)); 255 toep->tp_flags |= TP_DATASENT; 256 } 257} 258 259int 260t3_push_frames(struct socket *so, int req_completion) 261{ 262 struct tcpcb *tp = sototcpcb(so); 263 struct toepcb *toep = tp->t_toe; 264 265 struct mbuf *tail, *m0, *last; 266 struct t3cdev *cdev; 267 struct tom_data *d; 268 int bytes, count, total_bytes; 269 bus_dma_segment_t segs[TX_MAX_SEGS], *segp; 270 segp = segs; 271 272 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { 273 DPRINTF("tcp state=%d\n", tp->t_state); 274 return (0); 275 } 276 277 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { 278 DPRINTF("disconnecting\n"); 279 280 return (0); 281 } 282 283 INP_LOCK_ASSERT(tp->t_inpcb); 284 285 SOCKBUF_LOCK(&so->so_snd); 286 287 d = TOM_DATA(TOE_DEV(so)); 288 cdev = d->cdev; 289 last = tail = so->so_snd.sb_sndptr ? 
so->so_snd.sb_sndptr : so->so_snd.sb_mb; 290 total_bytes = 0; 291 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", 292 toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last); 293 294 if (last && toep->tp_m_last == last && so->so_snd.sb_sndptroff != 0) { 295 KASSERT(tail, ("sbdrop error")); 296 last = tail = tail->m_next; 297 } 298 299 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { 300 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); 301 SOCKBUF_UNLOCK(&so->so_snd); 302 return (0); 303 } 304 305 toep->tp_m_last = NULL; 306 while (toep->tp_wr_avail && (tail != NULL)) { 307 count = bytes = 0; 308 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { 309 SOCKBUF_UNLOCK(&so->so_snd); 310 return (0); 311 } 312 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) 313 && (tail != NULL) && (count < TX_MAX_SEGS)) { 314 bytes += tail->m_len; 315 count++; 316 last = tail; 317 /* 318 * technically an abuse to be using this for a VA 319 * but less gross than defining my own structure 320 * or calling pmap_kextract from here :-| 321 */ 322 segp->ds_addr = (bus_addr_t)tail->m_data; 323 segp->ds_len = tail->m_len; 324 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", 325 count, mbuf_wrs[count], tail->m_data, tail->m_len); 326 327 segp++; 328 tail = tail->m_next; 329 } 330 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", 331 toep->tp_wr_avail, count, mbuf_wrs[count], tail); 332 if (tail) { 333 so->so_snd.sb_sndptr = tail; 334 toep->tp_m_last = NULL; 335 } else 336 toep->tp_m_last = so->so_snd.sb_sndptr = last; 337 338 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); 339 340 so->so_snd.sb_sndptroff += bytes; 341 total_bytes += bytes; 342 toep->tp_write_seq += bytes; 343 344 345 SOCKBUF_UNLOCK(&so->so_snd); 346 347 /* 348 * XXX can drop socket buffer lock here 349 */ 350 351 toep->tp_wr_avail -= mbuf_wrs[count]; 352 toep->tp_wr_unacked += mbuf_wrs[count]; 353 354 make_tx_data_wr(so, m0, bytes, tail); 355 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so)); 356 
m_set_sgl(m0, segs); 357 m_set_sgllen(m0, count); 358 /* 359 * remember credits used 360 */ 361 m0->m_pkthdr.csum_data = mbuf_wrs[count]; 362 m0->m_pkthdr.len = bytes; 363 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || 364 toep->tp_wr_unacked >= toep->tp_wr_max / 2) { 365 struct work_request_hdr *wr = cplhdr(m0); 366 367 wr->wr_hi |= htonl(F_WR_COMPL); 368 toep->tp_wr_unacked = 0; 369 } 370 371 m0->m_type = MT_DONTFREE; 372 enqueue_wr(toep, m0); 373 DPRINTF("sending offload tx with %d bytes in %d segments\n", 374 bytes, count); 375 376 l2t_send(cdev, m0, toep->tp_l2t); 377 if (toep->tp_wr_avail && (tail != NULL)) 378 SOCKBUF_LOCK(&so->so_snd); 379 } 380 381 SOCKBUF_UNLOCK_ASSERT(&so->so_snd); 382 return (total_bytes); 383} 384 385/* 386 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail 387 * under any circumstances. We take the easy way out and always queue the 388 * message to the write_queue. We can optimize the case where the queue is 389 * already empty though the optimization is probably not worth it. 
390 */ 391static void 392close_conn(struct socket *so) 393{ 394 struct mbuf *m; 395 struct cpl_close_con_req *req; 396 struct tom_data *d; 397 struct inpcb *inp = sotoinpcb(so); 398 struct tcpcb *tp; 399 struct toepcb *toep; 400 unsigned int tid; 401 402 403 INP_LOCK(inp); 404 tp = sototcpcb(so); 405 toep = tp->t_toe; 406 407 if (tp->t_state != TCPS_SYN_SENT) 408 t3_push_frames(so, 1); 409 410 if (toep->tp_flags & TP_FIN_SENT) { 411 INP_UNLOCK(inp); 412 return; 413 } 414 415 tid = toep->tp_tid; 416 417 d = TOM_DATA(toep->tp_toedev); 418 419 m = m_gethdr_nofail(sizeof(*req)); 420 421 toep->tp_flags |= TP_FIN_SENT; 422 req = mtod(m, struct cpl_close_con_req *); 423 424 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); 425 req->wr.wr_lo = htonl(V_WR_TID(tid)); 426 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 427 req->rsvd = htonl(toep->tp_write_seq); 428 INP_UNLOCK(inp); 429 /* 430 * XXX - need to defer shutdown while there is still data in the queue 431 * 432 */ 433 cxgb_ofld_send(d->cdev, m); 434 435} 436 437/* 438 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant 439 * and send it along. 440 */ 441static void 442abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) 443{ 444 struct cpl_abort_req *req = cplhdr(m); 445 446 req->cmd = CPL_ABORT_NO_RST; 447 cxgb_ofld_send(cdev, m); 448} 449 450/* 451 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are 452 * permitted to return without sending the message in case we cannot allocate 453 * an sk_buff. Returns the number of credits sent. 
454 */ 455uint32_t 456t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) 457{ 458 struct mbuf *m; 459 struct cpl_rx_data_ack *req; 460 struct toepcb *toep = tp->t_toe; 461 struct toedev *tdev = toep->tp_toedev; 462 463 m = m_gethdr_nofail(sizeof(*req)); 464 465 DPRINTF("returning %u credits to HW\n", credits); 466 467 req = mtod(m, struct cpl_rx_data_ack *); 468 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 469 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 470 req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); 471 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep))); 472 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 473 return (credits); 474} 475 476 477/* 478 * Set of states for which we should return RX credits. 479 */ 480#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 481 482/* 483 * Called after some received data has been read. It returns RX credits 484 * to the HW for the amount of data processed. 
485 */ 486void 487t3_cleanup_rbuf(struct tcpcb *tp) 488{ 489 struct toepcb *toep = tp->t_toe; 490 struct socket *so; 491 struct toedev *dev; 492 int dack_mode, must_send, read; 493 u32 thres, credits, dack = 0; 494 495 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 496 (tp->t_state == TCPS_FIN_WAIT_2))) 497 return; 498 INP_LOCK_ASSERT(tp->t_inpcb); 499 500 so = tp->t_inpcb->inp_socket; 501 SOCKBUF_LOCK(&so->so_rcv); 502 read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc; 503 toep->tp_copied_seq += read; 504 toep->tp_enqueued_bytes -= read; 505 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 506 SOCKBUF_UNLOCK(&so->so_rcv); 507 508 if (credits > so->so_rcv.sb_mbmax) 509 printf("copied_seq=%u rcv_wup=%u credits=%u\n", 510 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 511 /* 512 * XXX this won't accurately reflect credit return - we need 513 * to look at the difference between the amount that has been 514 * put in the recv sockbuf and what is there now 515 */ 516 517 if (__predict_false(!credits)) 518 return; 519 520 dev = toep->tp_toedev; 521 thres = TOM_TUNABLE(dev, rx_credit_thres); 522 523 if (__predict_false(thres == 0)) 524 return; 525 526 if (toep->tp_ulp_mode) 527 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 528 else { 529 dack_mode = TOM_TUNABLE(dev, delack); 530 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 531 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 532 533 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 534 dack = F_RX_DACK_CHANGE | 535 V_RX_DACK_MODE(dack_mode); 536 } 537 } 538 539 /* 540 * For coalescing to work effectively ensure the receive window has 541 * at least 16KB left. 
542 */ 543 must_send = credits + 16384 >= tp->rcv_wnd; 544 545 if (must_send || credits >= thres) 546 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 547} 548 549static int 550cxgb_toe_disconnect(struct tcpcb *tp) 551{ 552 struct socket *so; 553 554 DPRINTF("cxgb_toe_disconnect\n"); 555 556 so = tp->t_inpcb->inp_socket; 557 close_conn(so); 558 return (0); 559} 560 561static int 562cxgb_toe_abort(struct tcpcb *tp) 563{ 564 struct toepcb *toep = tp->t_toe; 565 566 567 t3_send_reset(toep); 568 569 /* 570 * unhook from socket 571 */ 572 tp->t_flags &= ~TF_TOE; 573 toep->tp_tp = NULL; 574 tp->t_toe = NULL; 575 return (0); 576} 577 578static int 579cxgb_toe_send(struct tcpcb *tp) 580{ 581 struct socket *so; 582 583 DPRINTF("cxgb_toe_send\n"); 584 dump_toepcb(tp->t_toe); 585 586 so = tp->t_inpcb->inp_socket; 587 t3_push_frames(so, 1); 588 return (0); 589} 590 591static int 592cxgb_toe_rcvd(struct tcpcb *tp) 593{ 594 INP_LOCK_ASSERT(tp->t_inpcb); 595 t3_cleanup_rbuf(tp); 596 597 return (0); 598} 599 600static void 601cxgb_toe_detach(struct tcpcb *tp) 602{ 603 struct toepcb *toep; 604 /* 605 * XXX how do we handle teardown in the SYN_SENT state? 
606 * 607 */ 608 INP_INFO_WLOCK(&tcbinfo); 609 toep = tp->t_toe; 610 toep->tp_tp = NULL; 611 612 /* 613 * unhook from socket 614 */ 615 tp->t_flags &= ~TF_TOE; 616 tp->t_toe = NULL; 617 INP_INFO_WUNLOCK(&tcbinfo); 618} 619 620 621static struct toe_usrreqs cxgb_toe_usrreqs = { 622 .tu_disconnect = cxgb_toe_disconnect, 623 .tu_abort = cxgb_toe_abort, 624 .tu_send = cxgb_toe_send, 625 .tu_rcvd = cxgb_toe_rcvd, 626 .tu_detach = cxgb_toe_detach, 627 .tu_detach = cxgb_toe_detach, 628 .tu_syncache_event = handle_syncache_event, 629}; 630 631 632static void 633__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word, 634 uint64_t mask, uint64_t val, int no_reply) 635{ 636 struct cpl_set_tcb_field *req; 637 struct tcpcb *tp = sototcpcb(so); 638 struct toepcb *toep = tp->t_toe; 639 640 req = mtod(m, struct cpl_set_tcb_field *); 641 m->m_pkthdr.len = m->m_len = sizeof(*req); 642 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 643 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 644 req->reply = V_NO_REPLY(no_reply); 645 req->cpu_idx = 0; 646 req->word = htons(word); 647 req->mask = htobe64(mask); 648 req->val = htobe64(val); 649 650 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); 651 send_or_defer(so, tp, m, 0); 652} 653 654static void 655t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val) 656{ 657 struct mbuf *m; 658 struct tcpcb *tp = sototcpcb(so); 659 struct toepcb *toep = tp->t_toe; 660 661 if (toep == NULL) 662 return; 663 664 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) 665 return; 666 667 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 668 669 __set_tcb_field(so, m, word, mask, val, 1); 670} 671 672/* 673 * Set one of the t_flags bits in the TCB. 
674 */ 675static void 676set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val) 677{ 678 t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 679} 680 681/* 682 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 683 */ 684static void 685t3_set_nagle(struct socket *so) 686{ 687 struct tcpcb *tp = sototcpcb(so); 688 689 set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 690} 691 692/* 693 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 694 */ 695void 696t3_set_keepalive(struct socket *so, int on_off) 697{ 698 set_tcb_tflag(so, S_TF_KEEPALIVE, on_off); 699} 700 701void 702t3_set_rcv_coalesce_enable(struct socket *so, int on_off) 703{ 704 set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off); 705} 706 707/* 708 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 709 */ 710static void 711t3_set_tos(struct socket *so) 712{ 713 t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 714 V_TCB_TOS(SO_TOS(so))); 715} 716 717 718/* 719 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 720 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 721 * set the PSH bit in the last segment, which would trigger delivery.] 722 * We work around the issue by setting a DDP buffer in a partial placed state, 723 * which guarantees that TP will schedule a timer. 
724 */ 725#define TP_DDP_TIMER_WORKAROUND_MASK\ 726 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ 727 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ 728 V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) 729#define TP_DDP_TIMER_WORKAROUND_VAL\ 730 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ 731 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\ 732 32)) 733 734static void 735t3_enable_ddp(struct socket *so, int on) 736{ 737 if (on) 738 t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), 739 V_TF_DDP_OFF(0)); 740 else 741 t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, 742 V_TF_DDP_OFF(1) | 743 TP_DDP_TIMER_WORKAROUND_MASK, 744 V_TF_DDP_OFF(1) | 745 TP_DDP_TIMER_WORKAROUND_VAL); 746 747} 748 749 750void 751t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color) 752{ 753 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx, 754 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), 755 tag_color); 756} 757 758void 759t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset, 760 unsigned int len) 761{ 762 if (buf_idx == 0) 763 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET, 764 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 765 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 766 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) | 767 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 768 else 769 t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET, 770 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 771 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32), 772 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) | 773 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32)); 774} 775 776static int 777t3_set_cong_control(struct socket *so, const char *name) 778{ 779#ifdef notyet 780 int cong_algo; 781 782 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) 783 if (!strcmp(name, t3_cong_ops[cong_algo].name)) 784 break; 785 786 if (cong_algo >= ARRAY_SIZE(t3_cong_ops)) 787 return -EINVAL; 788#endif 789 return 0; 790} 791 792int 
793t3_get_tcb(struct socket *so) 794{ 795 struct cpl_get_tcb *req; 796 struct tcpcb *tp = sototcpcb(so); 797 struct toepcb *toep = tp->t_toe; 798 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA); 799 800 if (!m) 801 return (ENOMEM); 802 803 INP_LOCK_ASSERT(tp->t_inpcb); 804 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so)); 805 req = mtod(m, struct cpl_get_tcb *); 806 m->m_pkthdr.len = m->m_len = sizeof(*req); 807 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 808 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid)); 809 req->cpuno = htons(toep->tp_qset); 810 if (sototcpcb(so)->t_state == TCPS_SYN_SENT) 811 mbufq_tail(&toep->out_of_order_queue, m); // defer 812 else 813 cxgb_ofld_send(T3C_DEV(so), m); 814 return 0; 815} 816 817static inline void 818so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid) 819{ 820 struct toepcb *toep = sototoep(so); 821 toepcb_hold(toep); 822 823 cxgb_insert_tid(d->cdev, d->client, toep, tid); 824} 825 826/** 827 * find_best_mtu - find the entry in the MTU table closest to an MTU 828 * @d: TOM state 829 * @mtu: the target MTU 830 * 831 * Returns the index of the value in the MTU table that is closest to but 832 * does not exceed the target MTU. 
833 */ 834static unsigned int 835find_best_mtu(const struct t3c_data *d, unsigned short mtu) 836{ 837 int i = 0; 838 839 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) 840 ++i; 841 return (i); 842} 843 844static unsigned int 845select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) 846{ 847 unsigned int idx; 848 849#ifdef notyet 850 struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt; 851#endif 852 if (tp) { 853 tp->t_maxseg = pmtu - 40; 854 if (tp->t_maxseg < td->mtus[0] - 40) 855 tp->t_maxseg = td->mtus[0] - 40; 856 idx = find_best_mtu(td, tp->t_maxseg + 40); 857 858 tp->t_maxseg = td->mtus[idx] - 40; 859 } else 860 idx = find_best_mtu(td, pmtu); 861 862 return (idx); 863} 864 865void 866t3_release_ddp_resources(struct toepcb *toep) 867{ 868 /* 869 * This is a no-op until we have DDP support 870 */ 871} 872 873static inline void 874free_atid(struct t3cdev *cdev, unsigned int tid) 875{ 876 struct toepcb *toep = cxgb_free_atid(cdev, tid); 877 878 if (toep) 879 toepcb_release(toep); 880} 881 882/* 883 * Release resources held by an offload connection (TID, L2T entry, etc.) 
884 */ 885static void 886t3_release_offload_resources(struct toepcb *toep) 887{ 888 struct tcpcb *tp = toep->tp_tp; 889 struct toedev *tdev = toep->tp_toedev; 890 struct t3cdev *cdev; 891 unsigned int tid = toep->tp_tid; 892 893 if (!tdev) 894 return; 895 896 cdev = TOEP_T3C_DEV(toep); 897 if (!cdev) 898 return; 899 900 toep->tp_qset = 0; 901 t3_release_ddp_resources(toep); 902 903#ifdef CTRL_SKB_CACHE 904 kfree_skb(CTRL_SKB_CACHE(tp)); 905 CTRL_SKB_CACHE(tp) = NULL; 906#endif 907 908 if (toep->tp_wr_avail != toep->tp_wr_max) { 909 purge_wr_queue(toep); 910 reset_wr_list(toep); 911 } 912 913 if (toep->tp_l2t) { 914 l2t_release(L2DATA(cdev), toep->tp_l2t); 915 toep->tp_l2t = NULL; 916 } 917 printf("setting toep->tp_tp to NULL\n"); 918 919 toep->tp_tp = NULL; 920 if (tp) { 921 INP_LOCK_ASSERT(tp->t_inpcb); 922 tp->t_toe = NULL; 923 tp->t_flags &= ~TF_TOE; 924 } 925 926 if (toep->tp_state == TCPS_SYN_SENT) { 927 free_atid(cdev, tid); 928#ifdef notyet 929 __skb_queue_purge(&tp->out_of_order_queue); 930#endif 931 } else { // we have TID 932 cxgb_remove_tid(cdev, toep, tid); 933 toepcb_release(toep); 934 } 935#if 0 936 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state); 937#endif 938} 939 940static void 941install_offload_ops(struct socket *so) 942{ 943 struct tcpcb *tp = sototcpcb(so); 944 945 KASSERT(tp->t_toe != NULL, ("toepcb not set")); 946 947 t3_install_socket_ops(so); 948 tp->t_flags |= TF_TOE; 949 tp->t_tu = &cxgb_toe_usrreqs; 950} 951 952/* 953 * Determine the receive window scaling factor given a target max 954 * receive window. 955 */ 956static __inline int 957select_rcv_wscale(int space) 958{ 959 int wscale = 0; 960 961 if (space > MAX_RCV_WND) 962 space = MAX_RCV_WND; 963 964 if (tcp_do_rfc1323) 965 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; 966 return wscale; 967} 968 969/* 970 * Determine the receive window size for a socket. 
971 */ 972static unsigned int 973select_rcv_wnd(struct socket *so) 974{ 975 struct toedev *dev = TOE_DEV(so); 976 struct tom_data *d = TOM_DATA(dev); 977 unsigned int wnd; 978 unsigned int max_rcv_wnd; 979 980 if (tcp_do_autorcvbuf) 981 wnd = tcp_autorcvbuf_max; 982 else 983 wnd = sbspace(&so->so_rcv); 984 985 /* XXX 986 * For receive coalescing to work effectively we need a receive window 987 * that can accomodate a coalesced segment. 988 */ 989 if (wnd < MIN_RCV_WND) 990 wnd = MIN_RCV_WND; 991 992 /* PR 5138 */ 993 max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ? 994 (uint32_t)d->rx_page_size * 23 : 995 MAX_RCV_WND); 996 997 return min(wnd, max_rcv_wnd); 998} 999 1000/* 1001 * Assign offload parameters to some socket fields. This code is used by 1002 * both active and passive opens. 1003 */ 1004static inline void 1005init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid, 1006 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep) 1007{ 1008 struct tcpcb *tp = sototcpcb(so); 1009 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev); 1010 1011 SOCK_LOCK_ASSERT(so); 1012 1013 printf("initializing offload socket\n"); 1014#ifdef notyet 1015 /* 1016 * We either need to fix push frames to work with sbcompress 1017 * or we need to add this 1018 */ 1019 so->so_rcv.sb_flags |= SB_TOE; 1020 so->so_snd.sb_flags |= SB_TOE; 1021#endif 1022 tp->t_toe = toep; 1023 toep->tp_tp = tp; 1024 toep->tp_toedev = dev; 1025 1026 toep->tp_tid = tid; 1027 toep->tp_l2t = e; 1028 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs); 1029 toep->tp_wr_unacked = 0; 1030 toep->tp_delack_mode = 0; 1031 1032 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu); 1033 /* 1034 * XXX broken 1035 * 1036 */ 1037 tp->rcv_wnd = select_rcv_wnd(so); 1038 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) && 1039 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; 1040 toep->tp_qset_idx = 0; 1041 1042 reset_wr_list(toep); 1043 DPRINTF("initialization done\n"); 1044} 1045 1046/* 1047 * The next two functions calculate the option 0 value for a socket. 1048 */ 1049static inline unsigned int 1050calc_opt0h(struct socket *so, int mtu_idx) 1051{ 1052 struct tcpcb *tp = sototcpcb(so); 1053 int wscale = select_rcv_wscale(tp->rcv_wnd); 1054 1055 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) | 1056 V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS | 1057 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx); 1058} 1059 1060static inline unsigned int 1061calc_opt0l(struct socket *so, int ulp_mode) 1062{ 1063 struct tcpcb *tp = sototcpcb(so); 1064 unsigned int val; 1065 1066 val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) | 1067 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ)); 1068 1069 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val); 1070 return (val); 1071} 1072 1073static inline unsigned int 1074calc_opt2(const struct socket *so, struct toedev *dev) 1075{ 1076 int flv_valid; 1077 1078 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1); 1079 1080 return V_FLAVORS_VALID(flv_valid) | 1081 V_CONG_CONTROL_FLAVOR(flv_valid ? 
TOM_TUNABLE(dev, cong_alg) : 0); 1082} 1083#if 0 1084(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1) 1085#endif 1086 1087static void 1088mk_act_open_req(struct socket *so, struct mbuf *m, 1089 unsigned int atid, const struct l2t_entry *e) 1090{ 1091 struct cpl_act_open_req *req; 1092 struct inpcb *inp = sotoinpcb(so); 1093 struct tcpcb *tp = intotcpcb(inp); 1094 struct toepcb *toep = tp->t_toe; 1095 struct toedev *tdev = TOE_DEV(so); 1096 1097 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so)); 1098 1099 req = mtod(m, struct cpl_act_open_req *); 1100 m->m_pkthdr.len = m->m_len = sizeof(*req); 1101 1102 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 1103 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); 1104 req->local_port = inp->inp_lport; 1105 req->peer_port = inp->inp_fport; 1106 memcpy(&req->local_ip, &inp->inp_laddr, 4); 1107 memcpy(&req->peer_ip, &inp->inp_faddr, 4); 1108 DPRINTF("connect smt_idx=%d\n", e->smt_idx); 1109 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) | 1110 V_TX_CHANNEL(e->smt_idx)); 1111 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode)); 1112 req->params = 0; 1113 req->opt2 = htonl(calc_opt2(so, tdev)); 1114} 1115 1116 1117/* 1118 * Convert an ACT_OPEN_RPL status to an errno. 
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down an active open that failed: release all HW offload resources
 * and, if the connection still has a tcpcb, drop it with the given error.
 * Caller must hold the inpcb lock when tp is non-NULL.
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tcp_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.  Takes the global tcbinfo write lock (needed
 * by tcp_drop) before looking at the possibly-detached tcpcb, then maps the
 * CPL status to an errno and fails the open.  Always consumes m.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	INP_INFO_WLOCK(&tcbinfo);
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	INP_LOCK(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	INP_UNLOCK(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);

	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 * (TCAM-full, 4-tuple-in-use, and ARP-miss failures never got one).
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.  ACT_OPEN_RPL is only delivered for
 * failed opens, so this path always fails the connection; if the HW actually
 * allocated a TID for the failed attempt it is queued for release first
 * (not needed on T3A).
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
/*
 * NOTE(review): this disabled body still passes a socket to fail_act_open(),
 * which now takes a toepcb — needs updating before it can be enabled.
 */
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);

	INP_LOCK(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	}
	INP_UNLOCK(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */

	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;

	/*
	 * NOTE(review): on the error paths below the freshly allocated toepcb
	 * is never released — looks like a leak when atid allocation or the
	 * L2T lookup fails; confirm against toepcb_alloc/toepcb_release.
	 */
	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;

	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	INP_LOCK_ASSERT(inp);
	/*
	 * NOTE(review): m_gethdr(9) takes (how, type); the arguments here
	 * appear swapped (MT_DATA passed as "how") — verify.
	 */
	m = m_gethdr(MT_DATA, M_WAITOK);

#if 0
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	SOCK_LOCK(so);

	init_offload_socket(so, tdev, atid, e, rt, toep);

	install_offload_ops(so);

	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);

	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);

	printf("sending off request\n");

	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	/* Non-zero ULP mode means DDP was selected in init_offload_socket. */
	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return 	(0);

free_tid:
	printf("failing connect - free atid\n");

	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
	return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.
1316 */ 1317static void 1318t3_send_reset(struct toepcb *toep) 1319{ 1320 1321 struct cpl_abort_req *req; 1322 unsigned int tid = toep->tp_tid; 1323 int mode = CPL_ABORT_SEND_RST; 1324 struct tcpcb *tp = toep->tp_tp; 1325 struct toedev *tdev = toep->tp_toedev; 1326 struct socket *so = NULL; 1327 struct mbuf *m; 1328 1329 if (tp) { 1330 INP_LOCK_ASSERT(tp->t_inpcb); 1331 so = toeptoso(toep); 1332 } 1333 1334 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || 1335 tdev == NULL)) 1336 return; 1337 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); 1338 1339 /* Purge the send queue so we don't send anything after an abort. */ 1340 if (so) 1341 sbflush(&so->so_snd); 1342 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) 1343 mode |= CPL_ABORT_POST_CLOSE_REQ; 1344 1345 m = m_gethdr_nofail(sizeof(*req)); 1346 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so)); 1347 set_arp_failure_handler(m, abort_arp_failure); 1348 1349 req = mtod(m, struct cpl_abort_req *); 1350 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); 1351 req->wr.wr_lo = htonl(V_WR_TID(tid)); 1352 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); 1353 req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; 1354 req->rsvd1 = !(toep->tp_flags & TP_DATASENT); 1355 req->cmd = mode; 1356 if (tp && (tp->t_state == TCPS_SYN_SENT)) 1357 mbufq_tail(&toep->out_of_order_queue, m); // defer 1358 else 1359 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); 1360} 1361 1362static int 1363t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) 1364{ 1365 struct inpcb *inp; 1366 int error, optval; 1367 1368 if (sopt->sopt_name == IP_OPTIONS) 1369 return (ENOPROTOOPT); 1370 1371 if (sopt->sopt_name != IP_TOS) 1372 return (EOPNOTSUPP); 1373 1374 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 1375 1376 if (error) 1377 return (error); 1378 1379 if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread)) 1380 return (EPERM); 1381 1382 inp = sotoinpcb(so); 1383 inp->inp_ip_tos = optval; 1384 1385 t3_set_tos(so); 1386 1387 return (0); 1388} 1389 1390static int 1391t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1392{ 1393 int err = 0; 1394 size_t copied; 1395 1396 if (sopt->sopt_name != TCP_CONGESTION && 1397 sopt->sopt_name != TCP_NODELAY) 1398 return (EOPNOTSUPP); 1399 1400 if (sopt->sopt_name == TCP_CONGESTION) { 1401 char name[TCP_CA_NAME_MAX]; 1402 int optlen = sopt->sopt_valsize; 1403 struct tcpcb *tp; 1404 1405 if (optlen < 1) 1406 return (EINVAL); 1407 1408 err = copyinstr(sopt->sopt_val, name, 1409 min(TCP_CA_NAME_MAX - 1, optlen), &copied); 1410 if (err) 1411 return (err); 1412 if (copied < 1) 1413 return (EINVAL); 1414 1415 tp = sototcpcb(so); 1416 /* 1417 * XXX I need to revisit this 1418 */ 1419 if ((err = t3_set_cong_control(so, name)) == 0) { 1420#ifdef notyet 1421 tp->t_cong_control = strdup(name, M_CXGB); 1422#endif 1423 } else 1424 return (err); 1425 } else { 1426 int optval, oldval; 1427 struct inpcb *inp; 1428 struct tcpcb *tp; 1429 1430 err = sooptcopyin(sopt, &optval, sizeof optval, 1431 sizeof optval); 1432 1433 if (err) 1434 return (err); 1435 1436 inp = sotoinpcb(so); 1437 tp = intotcpcb(inp); 1438 1439 INP_LOCK(inp); 
1440 1441 oldval = tp->t_flags; 1442 if (optval) 1443 tp->t_flags |= TF_NODELAY; 1444 else 1445 tp->t_flags &= ~TF_NODELAY; 1446 INP_UNLOCK(inp); 1447 1448 if (oldval != tp->t_flags) 1449 t3_set_nagle(so); 1450 1451 } 1452 1453 return (0); 1454} 1455 1456static int 1457t3_ctloutput(struct socket *so, struct sockopt *sopt) 1458{ 1459 int err; 1460 1461 if (sopt->sopt_level != IPPROTO_TCP) 1462 err = t3_ip_ctloutput(so, sopt); 1463 else 1464 err = t3_tcp_ctloutput(so, sopt); 1465 1466 if (err != EOPNOTSUPP) 1467 return (err); 1468 1469 return tcp_ctloutput(so, sopt); 1470} 1471 1472/* 1473 * Process new data received for a connection. 1474 */ 1475static void 1476new_rx_data(struct toepcb *toep, struct mbuf *m) 1477{ 1478 struct cpl_rx_data *hdr = cplhdr(m); 1479 struct tcpcb *tp = toep->tp_tp; 1480 struct socket *so = toeptoso(toep); 1481 int len = be16toh(hdr->len); 1482 1483 INP_LOCK(tp->t_inpcb); 1484 1485#ifdef notyet 1486 if (__predict_false(sk_no_receive(sk))) { 1487 handle_excess_rx(so, skb); 1488 return; 1489 } 1490 1491 if (ULP_MODE(tp) == ULP_MODE_TCPDDP) 1492 handle_ddp_data(so, skb); 1493 1494 TCP_SKB_CB(skb)->seq = ntohl(hdr->seq); 1495 TCP_SKB_CB(skb)->flags = 0; 1496 skb_ulp_mode(skb) = 0; /* for iSCSI */ 1497#endif 1498#if VALIDATE_SEQ 1499 if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) { 1500 printk(KERN_ERR 1501 "%s: TID %u: Bad sequence number %u, expected %u\n", 1502 TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq, 1503 tp->rcv_nxt); 1504 __kfree_skb(skb); 1505 return; 1506 } 1507#endif 1508 m_adj(m, sizeof(*hdr)); 1509 1510#ifdef notyet 1511 /* 1512 * We don't handle urgent data yet 1513 */ 1514 if (__predict_false(hdr->urg)) 1515 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg)); 1516 if (__predict_false(tp->urg_data == TCP_URG_NOTYET && 1517 tp->urg_seq - tp->rcv_nxt < skb->len)) 1518 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - 1519 tp->rcv_nxt]; 1520#endif 1521 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) 
{ 1522 toep->tp_delack_mode = hdr->dack_mode; 1523 toep->tp_delack_seq = tp->rcv_nxt; 1524 } 1525 1526 DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len); 1527 1528 if (len < m->m_pkthdr.len) 1529 m->m_pkthdr.len = m->m_len = len; 1530 1531 tp->rcv_nxt += m->m_pkthdr.len; 1532 tp->t_rcvtime = ticks; 1533 toep->tp_enqueued_bytes += m->m_pkthdr.len; 1534#ifdef T3_TRACE 1535 T3_TRACE2(TIDTB(sk), 1536 "new_rx_data: seq 0x%x len %u", 1537 TCP_SKB_CB(skb)->seq, skb->len); 1538#endif 1539 SOCKBUF_LOCK(&so->so_rcv); 1540 if (sb_notify(&so->so_rcv)) 1541 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len); 1542 1543 sbappend_locked(&so->so_rcv, m); 1544 KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax, 1545 1546 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d", 1547 so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax)); 1548 1549 INP_UNLOCK(tp->t_inpcb); 1550 DPRINTF("sb_cc=%d sb_mbcnt=%d\n", 1551 so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt); 1552 1553 if (__predict_true((so->so_state & SS_NOFDREF) == 0)) 1554 sorwakeup_locked(so); 1555 else 1556 SOCKBUF_UNLOCK(&so->so_rcv); 1557} 1558 1559/* 1560 * Handler for RX_DATA CPL messages. 
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

/*
 * Process an RX_DATA_DDP message: data was placed directly into a DDP
 * buffer by the HW and this CPL only describes where.  Updates the DDP
 * buffer-state bookkeeping and appends the (payload-less) mbuf, whose
 * m_len/csum fields are overloaded to describe the placement, to the
 * receive buffer.  Caller context supplies the sockbuf locking —
 * NOTE(review): sbappendstream_locked is called here; confirm the receive
 * sockbuf lock is in fact held on this path.
 */
static void
new_rx_data_ddp(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(so, m);
		return;
	}
#endif
	tp = sototcpcb(so);
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	/* The report says which of the two DDP buffers the data landed in. */
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
		  "hdr seq 0x%x len %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
	T3_TRACE1(TIDTB(sk),
		  "new_rx_data_ddp: ddp_report 0x%x",
		  ddp_report);
#endif

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	/*
	 * Overload to store old rcv_next
	 */
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
#ifdef notyet
	TCP_SKB_CB(skb)->when = end_offset - skb->len;

	/*
	 * We store in mac.raw the address of the gather list where the
	 * placement happened.
	 */
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	bsp->cur_offset = end_offset;

	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
#if 0
		TCP_SKB_CB(skb)->flags = 0;  /* potential spurious completion */
#endif
		panic("spurious ddp completion");
	} else {
		m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_pkthdr.csum_flags |= DDP_BF_PSH;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/* All DDP error conditions reported in an RX_DATA_DDP status word. */
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	/* On any HW-reported DDP error just log and let the caller free m. */
	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		    GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return CPL_RET_BUF_DONE;
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(so, m);
	return (0);
}

/*
 * Process an RX_DDP_COMPLETE message: the HW finished filling a DDP buffer.
 * Converts the report into the same overloaded-mbuf representation used by
 * new_rx_data_ddp (m_len = bytes placed since last report, csum_data = old
 * rcv_nxt, csum bit 0 = "buffer complete") and appends it to the sockbuf.
 */
static void
process_ddp_complete(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(sk, skb);
		return;
	}
#endif
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	/* Bytes newly placed = reported offset minus where we left off. */
	when = bsp->cur_offset;
	m->m_len = G_DDP_OFFSET(ddp_report) - when;

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		  G_DDP_OFFSET(ddp_report), skb->len);
#endif

	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;                     /* flip buffers */

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		  G_DDP_OFFSET(ddp_report));
#endif
#if 0
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif
	/* Bit 0 set = buffer complete; see new_rx_data_ddp. */
	m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt += m->m_len;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(so, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	INP_LOCK_ASSERT(tp->t_inpcb);
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	tcp_twstart(tp);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int keep = 0, dead = (so->so_state & SS_NOFDREF);

	DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);

#ifdef T3_TRACE
	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif

	/*
	 * With an abort reply pending this PEER_CLOSE is stale (except on
	 * T3A, where the HW may still deliver it legitimately) — drop it.
	 */
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");

		goto out;
	}

#ifdef notyet
	if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, skb);
		if (keep < 0)
			return;
	}
	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(so, SOCK_DONE);
#endif
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0)
		socantrcvmore(so);
	/* Advance the TCP state machine for the received FIN. */
	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
	    tp->t_starttime = ticks;
	/* FALLTHROUGH */
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received PEER_CLOSE in bad state %d\n",
		    TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	/*
	 * NOTE(review): tcp_close() may return NULL (inpcb gone), hence the
	 * guard; but after enter_timewait()/tcp_twstart() tp is left non-NULL
	 * — verify the inpcb is still valid to unlock on that path.
	 */
	if (tp)
		INP_UNLOCK(tp->t_inpcb);

	if (!dead) {
		DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);

		/* Wake readers, writers, and anyone blocked in connect/close. */
		sorwakeup(so);
		sowwakeup(so);
		wakeup(&so->so_timeo);
#ifdef notyet
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(so, 1, POLL_HUP);
		else
			sk_wake_async(so, 1, POLL_IN);
#endif
	}
out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	do_peer_fin(so, m);
	return (0);
}

/*
 * Process a CLOSE_CON_RPL: the HW acknowledged our FIN.  snd_una is advanced
 * to cover everything except the FIN itself, then the state machine is
 * advanced as for an ACK of our FIN.  Always consumes m.
 */
static void
process_close_con_rpl(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct toepcb *toep = tp->t_toe;

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
	    !!(so->so_state & SS_NOFDREF));
	/* Stale if an abort is in flight, except on T3A (see do_peer_fin). */
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
		goto out;

	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);

		} else
			enter_timewait(so);
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		tp = tcp_close(tp);
		break;
	case TCPS_FIN_WAIT_1:
#ifdef notyet
		dst_confirm(sk->sk_dst_cache);
#endif
		soisdisconnecting(so);

		if ((so->so_state & SS_NOFDREF) == 0) {
			/*
			 * Wake up lingering close
			 */
			sowwakeup(so);
			sorwakeup(so);
			wakeup(&so->so_timeo);
		} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			/* Hard-close requested by SO_LINGER 0: drop now. */
			tp = tcp_drop(tp, 0);
		}

		break;
	default:
		log(LOG_ERR,
		    "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		    TOE_DEV(so)->tod_name, toep->tp_tid,
		    tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
out:
	m_free(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
			    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	process_close_con_rpl(so, m);
	return (0);
}

/*
 * Process abort replies.  We only process these messages if we anticipate
 * them as the coordination between SW and HW in this area is somewhat lacking
 * and sometimes we get ABORT_RPLs after we are done with the connection that
 * originated the ABORT_REQ.
1998 */ 1999static void 2000process_abort_rpl(struct socket *so, struct mbuf *m) 2001{ 2002 struct tcpcb *tp = sototcpcb(so); 2003 struct toepcb *toep = tp->t_toe; 2004 2005#ifdef T3_TRACE 2006 T3_TRACE1(TIDTB(sk), 2007 "process_abort_rpl: GTS rpl pending %d", 2008 sock_flag(sk, ABORT_RPL_PENDING)); 2009#endif 2010 INP_LOCK(tp->t_inpcb); 2011 2012 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2013 /* 2014 * XXX panic on tcpdrop 2015 */ 2016 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so))) 2017 toep->tp_flags |= TP_ABORT_RPL_RCVD; 2018 else { 2019 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); 2020 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || 2021 !is_t3a(TOE_DEV(so))) { 2022 if (toep->tp_flags & TP_ABORT_REQ_RCVD) 2023 panic("TP_ABORT_REQ_RCVD set"); 2024 INP_INFO_WLOCK(&tcbinfo); 2025 INP_LOCK(tp->t_inpcb); 2026 t3_release_offload_resources(toep); 2027 tp = tcp_close(tp); 2028 INP_INFO_WUNLOCK(&tcbinfo); 2029 } 2030 } 2031 } 2032 if (tp) 2033 INP_UNLOCK(tp->t_inpcb); 2034 2035 m_free(m); 2036} 2037 2038/* 2039 * Handle an ABORT_RPL_RSS CPL message. 2040 */ 2041static int 2042do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2043{ 2044 struct socket *so; 2045 struct cpl_abort_rpl_rss *rpl = cplhdr(m); 2046 struct toepcb *toep; 2047 2048 /* 2049 * Ignore replies to post-close aborts indicating that the abort was 2050 * requested too late. These connections are terminated when we get 2051 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 2052 * arrives the TID is either no longer used or it has been recycled. 
2053 */ 2054 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2055discard: 2056 m_free(m); 2057 return (0); 2058 } 2059 2060 toep = (struct toepcb *)ctx; 2061 2062 /* 2063 * Sometimes we've already closed the socket, e.g., a post-close 2064 * abort races with ABORT_REQ_RSS, the latter frees the socket 2065 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2066 * but FW turns the ABORT_REQ into a regular one and so we get 2067 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2068 */ 2069 if (!toep) 2070 goto discard; 2071 2072 if (toep->tp_tp == NULL) { 2073 printf("removing tid for abort\n"); 2074 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2075 if (toep->tp_l2t) 2076 l2t_release(L2DATA(cdev), toep->tp_l2t); 2077 2078 toepcb_release(toep); 2079 goto discard; 2080 } 2081 2082 printf("toep=%p\n", toep); 2083 printf("tp=%p\n", toep->tp_tp); 2084 2085 so = toeptoso(toep); /* <- XXX panic */ 2086 toepcb_hold(toep); 2087 process_abort_rpl(so, m); 2088 toepcb_release(toep); 2089 return (0); 2090} 2091 2092/* 2093 * Convert the status code of an ABORT_REQ into a Linux error code. Also 2094 * indicate whether RST should be sent in response. 2095 */ 2096static int 2097abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2098{ 2099 struct tcpcb *tp = sototcpcb(so); 2100 2101 switch (abort_reason) { 2102 case CPL_ERR_BAD_SYN: 2103#if 0 2104 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2105#endif 2106 case CPL_ERR_CONN_RESET: 2107 // XXX need to handle SYN_RECV due to crossed SYNs 2108 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2109 case CPL_ERR_XMIT_TIMEDOUT: 2110 case CPL_ERR_PERSIST_TIMEDOUT: 2111 case CPL_ERR_FINWAIT2_TIMEDOUT: 2112 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2113#if 0 2114 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2115#endif 2116 return (ETIMEDOUT); 2117 default: 2118 return (EIO); 2119 } 2120} 2121 2122static inline void 2123set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2124{ 2125 struct cpl_abort_rpl *rpl = cplhdr(m); 2126 2127 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2128 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2129 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2130 2131 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2132 rpl->cmd = cmd; 2133} 2134 2135static void 2136send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2137{ 2138 struct mbuf *reply_mbuf; 2139 struct cpl_abort_req_rss *req = cplhdr(m); 2140 2141 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2142 m_set_priority(m, CPL_PRIORITY_DATA); 2143 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2144 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2145 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2146 m_free(m); 2147} 2148 2149/* 2150 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2151 */ 2152static inline int 2153is_neg_adv_abort(unsigned int status) 2154{ 2155 return status == CPL_ERR_RTX_NEG_ADVICE || 2156 status == CPL_ERR_PERSIST_NEG_ADVICE; 2157} 2158 2159static void 2160send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2161{ 2162 struct mbuf *reply_mbuf; 2163 struct cpl_abort_req_rss *req = cplhdr(m); 2164 2165 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2166 2167 if (!reply_mbuf) { 2168 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2169 req->status = rst_status; 2170 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2171 return; 2172 } 2173 2174 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2175 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2176 m_free(m); 2177 2178 /* 2179 * XXX need to sync with ARP as for SYN_RECV connections we can send 2180 * these messages while ARP is pending. For other connection states 2181 * it's not a problem. 2182 */ 2183 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2184} 2185 2186#ifdef notyet 2187static void 2188cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2189{ 2190 UNIMPLEMENTED(); 2191#ifdef notyet 2192 struct request_sock *req = child->sk_user_data; 2193 2194 inet_csk_reqsk_queue_removed(parent, req); 2195 synq_remove(tcp_sk(child)); 2196 __reqsk_free(req); 2197 child->sk_user_data = NULL; 2198#endif 2199} 2200 2201 2202/* 2203 * Performs the actual work to abort a SYN_RECV connection. 2204 */ 2205static void 2206do_abort_syn_rcv(struct socket *child, struct socket *parent) 2207{ 2208 struct tcpcb *parenttp = sototcpcb(parent); 2209 struct tcpcb *childtp = sototcpcb(child); 2210 2211 /* 2212 * If the server is still open we clean up the child connection, 2213 * otherwise the server already did the clean up as it was purging 2214 * its SYN queue and the skb was just sitting in its backlog. 2215 */ 2216 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2217 cleanup_syn_rcv_conn(child, parent); 2218 INP_INFO_WLOCK(&tcbinfo); 2219 INP_LOCK(childtp->t_inpcb); 2220 t3_release_offload_resources(childtp->t_toe); 2221 childtp = tcp_close(childtp); 2222 INP_INFO_WUNLOCK(&tcbinfo); 2223 if (childtp) 2224 INP_UNLOCK(childtp->t_inpcb); 2225 } 2226} 2227#endif 2228 2229/* 2230 * Handle abort requests for a SYN_RECV connection. These need extra work 2231 * because the socket is on its parent's SYN queue. 
 */
/*
 * Handle an ABORT_REQ for a connection that is still on its parent's SYN
 * queue.  Returns 0 once the abort (including the ABORT_RPL) has been fully
 * handled here.  Currently a stub: UNIMPLEMENTED() fires first and the
 * disabled body below documents the intended implementation.
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
	UNIMPLEMENTED();
#ifdef notyet
	/*
	 * Intended (disabled) implementation: locate the listening parent
	 * via the stid stashed in the original request, tear the embryonic
	 * connection down under the parent's lock, and reply with an
	 * ABORT_RPL that suppresses the RST.
	 */
	struct socket *parent;
	struct toedev *tdev = TOE_DEV(so);
	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
	struct socket *oreq = so->so_incomp;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;

	if (!oreq)
		return -1;        /* somehow we are not on the SYN queue */

	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, oreq->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	SOCK_LOCK(parent);
	do_abort_syn_rcv(so, parent);
	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
	SOCK_UNLOCK(parent);
#endif
	return (0);
}

/*
 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
 * request except that we need to reply to it.
 *
 * Called from do_abort_req() with the tcbinfo write lock held; takes and
 * releases the inpcb lock itself.  The mbuf m is either freed here or handed
 * to send_abort_rpl() for the reply.
 */
static void
process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
{
	int rst_status = CPL_ABORT_NO_RST;
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	INP_LOCK(tp->t_inpcb);
	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
		/*
		 * NOTE(review): the first ABORT_REQ seen for this tid is only
		 * recorded (flags set) and dropped; the teardown below runs
		 * on a subsequent one.  Presumably this pairs with the HW
		 * delivering the abort twice — confirm against T3 CPL
		 * semantics.
		 */
		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
		m_free(m);
		goto skip;
	}

	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
	/*
	 * Three cases to consider:
	 * a) We haven't sent an abort_req; close the connection.
	 * b) We have sent a post-close abort_req that will get to TP too late
	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
	 *    be ignored and the connection should be closed now.
	 * c) We have sent a regular abort_req that will get to TP too late.
	 *    That will generate an abort_rpl with status 0, wait for it.
	 */
	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
		so->so_error = abort_status_to_errno(so, req->status,
		    &rst_status);
#if 0
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
#endif
		/*
		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 is has taken care of the abort.
		 */
		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
			goto skip;

		t3_release_offload_resources(toep);
		tp = tcp_close(tp);
	}
	/* tcp_close() may have freed the inpcb and returned NULL. */
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
	send_abort_rpl(m, tdev, rst_status);
	return;

skip:
	INP_UNLOCK(tp->t_inpcb);
}

/*
 * Handle an ABORT_REQ_RSS CPL message.
 *
 * Negative-advice aborts are purely informational and are dropped.  Aborts
 * for embryonic (SYN_RCVD) connections are finished off inline; established
 * connections are handed to process_abort_req() under the tcbinfo write
 * lock, with an extra toepcb reference held across the call.
 */
static int
do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so;
	struct inpcb *inp;

	if (is_neg_adv_abort(req->status)) {
		m_free(m);
		return (0);
	}

	printf("aborting tid=%d\n", toep->tp_tid);

	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
		/*
		 * Connection never reached ESTABLISHED: give back the tid,
		 * acknowledge the abort, drop our l2t reference and detach
		 * the toepcb from its tcpcb before releasing it.
		 */
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		toep->tp_flags |= TP_ABORT_REQ_RCVD;
		printf("sending abort rpl\n");

		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
		printf("sent\n");
		if (toep->tp_l2t)
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		/*
		 * Unhook
		 */
		toep->tp_tp->t_toe = NULL;
		toep->tp_tp->t_flags &= ~TF_TOE;
		toep->tp_tp = NULL;
		/*
		 * XXX need to call syncache_chkrst - but we don't
		 * have a way of doing that yet
		 */
		toepcb_release(toep);
		printf("abort for unestablished connection :-(\n");
		return (0);
	}
	if (toep->tp_tp == NULL) {
		printf("disconnected toepcb\n");
		/* should be freed momentarily */
		return (0);
	}

	so = toeptoso(toep);
	inp = sotoinpcb(so);

	VALIDATE_SOCK(so);
	/* Hold the toepcb so it survives process_abort_req()'s teardown. */
	toepcb_hold(toep);
	INP_INFO_WLOCK(&tcbinfo);
	process_abort_req(so, m, TOE_DEV(so));
	INP_INFO_WUNLOCK(&tcbinfo);
	toepcb_release(toep);
	return (0);
}
#ifdef notyet
/*
 * Abort a passive open that was answered with a PASS_ACCEPT_RPL whose ARP
 * resolution failed.  On T3 the original reply mbuf is rewritten into a
 * reject and sent; otherwise it is simply freed.  (Disabled.)
 */
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
	struct toedev *tdev = TOE_DEV(parent);

	do_abort_syn_rcv(child, parent);
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
		struct cpl_pass_accept_rpl *rpl = cplhdr(m);

		rpl->opt0h = htonl(F_TCAM_BYPASS);
		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	} else
		m_free(m);
}
#endif
/*
 * Handle an ARP failure while answering a passive open.  Currently a stub;
 * the disabled body shows the intended parent lookup and teardown.
 */
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
	UNIMPLEMENTED();

#ifdef notyet
	struct t3cdev *cdev;
	struct socket *parent;
	struct socket *oreq;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;
	struct tcpcb *otp, *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/*
	 * If the connection is being aborted due to the parent listening
	 * socket going away there's nothing to do, the ABORT_REQ will close
	 * the connection.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		m_free(m);
		return;
	}

	oreq = so->so_incomp;
	otp = sototcpcb(oreq);

	cdev = T3C_DEV(so);
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, otp->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	SOCK_LOCK(parent);
	pass_open_abort(so, parent, m);
	SOCK_UNLOCK(parent);
#endif
}

/*
 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
 * connection.
2434 */ 2435static void 2436pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) 2437{ 2438 2439#ifdef notyet 2440 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 2441 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); 2442#endif 2443 handle_pass_open_arp_failure(m_get_socket(m), m); 2444} 2445 2446/* 2447 * Populate a reject CPL_PASS_ACCEPT_RPL WR. 2448 */ 2449static void 2450mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) 2451{ 2452 struct cpl_pass_accept_req *req = cplhdr(req_mbuf); 2453 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); 2454 unsigned int tid = GET_TID(req); 2455 2456 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); 2457 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 2458 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 2459 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet 2460 rpl->opt0h = htonl(F_TCAM_BYPASS); 2461 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 2462 rpl->opt2 = 0; 2463 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 2464} 2465 2466/* 2467 * Send a deferred reject to an accept request. 
2468 */ 2469static void 2470reject_pass_request(struct toedev *tdev, struct mbuf *m) 2471{ 2472 struct mbuf *reply_mbuf; 2473 2474 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 2475 mk_pass_accept_rpl(reply_mbuf, m); 2476 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2477 m_free(m); 2478} 2479 2480static void 2481handle_syncache_event(int event, void *arg) 2482{ 2483 struct toepcb *toep = arg; 2484 2485 switch (event) { 2486 case SC_ENTRY_PRESENT: 2487 /* 2488 * entry already exists - free toepcb 2489 * and l2t 2490 */ 2491 printf("syncache entry present\n"); 2492 toepcb_release(toep); 2493 break; 2494 case SC_DROP: 2495 /* 2496 * The syncache has given up on this entry 2497 * either it timed out, or it was evicted 2498 * we need to explicitly release the tid 2499 */ 2500 printf("syncache entry dropped\n"); 2501 toepcb_release(toep); 2502 break; 2503 default: 2504 log(LOG_ERR, "unknown syncache event %d\n", event); 2505 break; 2506 } 2507} 2508 2509static void 2510syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 2511{ 2512 struct in_conninfo inc; 2513 struct tcpopt to; 2514 struct tcphdr th; 2515 struct inpcb *inp; 2516 int mss, wsf, sack, ts; 2517 2518 bzero(&to, sizeof(struct tcpopt)); 2519 inp = sotoinpcb(lso); 2520 2521 /* 2522 * Fill out information for entering us into the syncache 2523 */ 2524 inc.inc_fport = th.th_sport = req->peer_port; 2525 inc.inc_lport = th.th_dport = req->local_port; 2526 toep->tp_iss = th.th_seq = req->rcv_isn; 2527 th.th_flags = TH_SYN; 2528 2529 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn); 2530 2531 inc.inc_isipv6 = 0; 2532 inc.inc_len = 0; 2533 inc.inc_faddr.s_addr = req->peer_ip; 2534 inc.inc_laddr.s_addr = req->local_ip; 2535 2536 DPRINTF("syncache add of %d:%d %d:%d\n", 2537 ntohl(req->local_ip), ntohs(req->local_port), 2538 ntohl(req->peer_ip), ntohs(req->peer_port)); 2539 2540 mss = req->tcp_options.mss; 2541 wsf = 
req->tcp_options.wsf; 2542 ts = req->tcp_options.tstamp; 2543 sack = req->tcp_options.sack; 2544 to.to_mss = mss; 2545 to.to_wscale = wsf; 2546 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 2547 2548 INP_INFO_WLOCK(&tcbinfo); 2549 INP_LOCK(inp); 2550 syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); 2551} 2552 2553 2554/* 2555 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket 2556 * lock held. Note that the sock here is a listening socket that is not owned 2557 * by the TOE. 2558 */ 2559static void 2560process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, 2561 struct listen_ctx *lctx) 2562{ 2563 int rt_flags; 2564 struct l2t_entry *e; 2565 struct iff_mac tim; 2566 struct mbuf *reply_mbuf, *ddp_mbuf = NULL; 2567 struct cpl_pass_accept_rpl *rpl; 2568 struct cpl_pass_accept_req *req = cplhdr(m); 2569 unsigned int tid = GET_TID(req); 2570 struct tom_data *d = TOM_DATA(tdev); 2571 struct t3cdev *cdev = d->cdev; 2572 struct tcpcb *tp = sototcpcb(so); 2573 struct toepcb *newtoep; 2574 struct rtentry *dst; 2575 struct sockaddr_in nam; 2576 struct t3c_data *td = T3C_DATA(cdev); 2577 2578 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2579 if (__predict_false(reply_mbuf == NULL)) { 2580 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 2581 t3_defer_reply(m, tdev, reject_pass_request); 2582 else { 2583 cxgb_queue_tid_release(cdev, tid); 2584 m_free(m); 2585 } 2586 DPRINTF("failed to get reply_mbuf\n"); 2587 2588 goto out; 2589 } 2590 2591 if (tp->t_state != TCPS_LISTEN) { 2592 DPRINTF("socket not in listen state\n"); 2593 2594 goto reject; 2595 } 2596 2597 tim.mac_addr = req->dst_mac; 2598 tim.vlan_tag = ntohs(req->vlan_tag); 2599 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { 2600 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); 2601 goto reject; 2602 } 2603 2604#ifdef notyet 2605 /* 2606 * XXX do route lookup to confirm that we're 
still listening on this 2607 * address 2608 */ 2609 if (ip_route_input(skb, req->local_ip, req->peer_ip, 2610 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) 2611 goto reject; 2612 rt_flags = ((struct rtable *)skb->dst)->rt_flags & 2613 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); 2614 dst_release(skb->dst); // done with the input route, release it 2615 skb->dst = NULL; 2616 2617 if ((rt_flags & RTF_LOCAL) == 0) 2618 goto reject; 2619#endif 2620 /* 2621 * XXX 2622 */ 2623 rt_flags = RTF_LOCAL; 2624 if ((rt_flags & RTF_LOCAL) == 0) 2625 goto reject; 2626 2627 /* 2628 * Calculate values and add to syncache 2629 */ 2630 2631 newtoep = toepcb_alloc(); 2632 if (newtoep == NULL) 2633 goto reject; 2634 2635 bzero(&nam, sizeof(struct sockaddr_in)); 2636 2637 nam.sin_len = sizeof(struct sockaddr_in); 2638 nam.sin_family = AF_INET; 2639 nam.sin_addr.s_addr =req->peer_ip; 2640 dst = rtalloc2((struct sockaddr *)&nam, 1, 0); 2641 2642 if (dst == NULL) { 2643 printf("failed to find route\n"); 2644 goto reject; 2645 } 2646 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, 2647 (struct sockaddr *)&nam); 2648 if (e == NULL) { 2649 DPRINTF("failed to get l2t\n"); 2650 } 2651 /* 2652 * Point to our listen socket until accept 2653 */ 2654 newtoep->tp_tp = tp; 2655 newtoep->tp_flags = TP_SYN_RCVD; 2656 newtoep->tp_tid = tid; 2657 newtoep->tp_toedev = tdev; 2658 2659 printf("inserting tid=%d\n", tid); 2660 cxgb_insert_tid(cdev, d->client, newtoep, tid); 2661 SOCK_LOCK(so); 2662 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); 2663 SOCK_UNLOCK(so); 2664 2665 2666 if (lctx->ulp_mode) { 2667 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2668 2669 if (!ddp_mbuf) 2670 newtoep->tp_ulp_mode = 0; 2671 else 2672 newtoep->tp_ulp_mode = lctx->ulp_mode; 2673 } 2674 2675 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); 2676 2677 DPRINTF("adding request to syn cache\n"); 2678 2679 /* 2680 * XXX workaround for lack of syncache drop 2681 */ 2682 toepcb_hold(newtoep); 2683 
syncache_add_accept_req(req, so, newtoep); 2684 2685 2686 2687 rpl = cplhdr(reply_mbuf); 2688 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); 2689 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 2690 rpl->wr.wr_lo = 0; 2691 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 2692 rpl->opt2 = htonl(calc_opt2(so, tdev)); 2693 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 2694 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten 2695 2696 DPRINTF("accept smt_idx=%d\n", e->smt_idx); 2697 2698 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | 2699 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); 2700 rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) | 2701 CPL_PASS_OPEN_ACCEPT); 2702 2703 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); 2704 2705 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so)); 2706 2707#ifdef DEBUG_PRINT 2708 { 2709 int i; 2710 2711 DPRINTF("rpl:\n"); 2712 uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *); 2713 2714 for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++) 2715 DPRINTF("[%d] %08x\n", i, rplbuf[i]); 2716 } 2717#endif 2718 2719 2720 l2t_send(cdev, reply_mbuf, e); 2721 m_free(m); 2722#ifdef notyet 2723 /* 2724 * XXX this call path has to be converted to not depend on sockets 2725 */ 2726 if (newtoep->tp_ulp_mode) 2727 __set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS, 2728 V_TF_DDP_OFF(1) | 2729 TP_DDP_TIMER_WORKAROUND_MASK, 2730 V_TF_DDP_OFF(1) | 2731 TP_DDP_TIMER_WORKAROUND_VAL, 1); 2732 2733#endif 2734 return; 2735reject: 2736 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 2737 mk_pass_accept_rpl(reply_mbuf, m); 2738 else 2739 mk_tid_release(reply_mbuf, NULL, tid); 2740 cxgb_ofld_send(cdev, reply_mbuf); 2741 m_free(m); 2742out: 2743#if 0 2744 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 2745#else 2746 return; 2747#endif 2748} 2749 2750/* 2751 * Handle a CPL_PASS_ACCEPT_REQ message. 
 */
/*
 * CPL handler: dispatch a passive-open request to the listening socket's
 * processing routine.  The VALIDATE_TID block is Linux-derived sanity
 * checking and is compiled out.
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso;
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to Linux's native format.
 *
 * opt is the raw tcp_opt word from the establish CPL; the G_TCPOPT_* macros
 * extract the negotiated MSS index, timestamp, SACK and window-scale bits.
 */
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	INP_LOCK_ASSERT(tp->t_inpcb);

	/* 40 = fixed IP + TCP header overhead subtracted from the MTU. */
	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
	tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
	tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
	/*
	 * NOTE(review): forcing rcv_scale to 0 when scaling WAS negotiated
	 * looks suspicious — one would expect the negotiated shift here.
	 * Confirm against the HW's window handling before changing.
	 */
	if (tp->t_flags & TF_RCVD_SCALE)
		tp->rcv_scale = 0;
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCP_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 */
static void
make_established(struct socket *so, u32 snd_isn, unsigned int opt)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	/* All send-side sequence state starts at the post-SYN ISN. */
	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
	assign_rxopt(so, opt);
	/* Route setsockopt/getsockopt through the TOE-aware handler. */
	so->so_proto->pr_ctloutput = t3_ctloutput;

#if 0
	inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif

	/*
	 * XXX not clear what rcv_wup maps to
	 */
	/*
	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
	 * pass through opt0.
	 */
	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);

	dump_toepcb(toep);

#ifdef notyet
/*
 * no clean interface for marking ARP up to date
 */
	dst_confirm(sk->sk_dst_cache);
#endif
	tp->t_state = TCPS_ESTABLISHED;
}

/*
 * Convert a CPL_PASS_ESTABLISH into the template headers syncache_expand()
 * expects and perform the expansion, yielding the new connection's socket
 * in *so.  Returns the syncache_expand() result (0 on failure).
 */
static int
syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
{

	struct in_conninfo inc;
	struct tcpopt to;
	struct tcphdr th;
	int mss, wsf, sack, ts;
	struct mbuf *m = NULL;
	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
	unsigned int opt;

#ifdef MAC
#error	"no MAC support"
#endif

	opt = ntohs(req->tcp_opt);

	bzero(&to, sizeof(struct tcpopt));

	/*
	 * Fill out information for entering us into the syncache
	 *
	 * NOTE(review): unlike `to`, the `inc` and `th` structures are only
	 * partially filled in; the remaining fields reach syncache_expand()
	 * uninitialized — they should probably be bzero'ed first.
	 */
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	th.th_seq = req->rcv_isn;
	th.th_flags = TH_ACK;

	inc.inc_isipv6 = 0;
	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;

	mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	wsf = G_TCPOPT_WSCALE_OK(opt);
	ts = G_TCPOPT_TSTAMP(opt);
	sack = G_TCPOPT_SACK(opt);

	to.to_mss = mss;
	to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);

	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port),
	    mss, wsf, ts, sack);
	return syncache_expand(&inc, &to, &th, so, m);
}


/*
 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 *
 * Removes the toepcb from the listener's SYN queue, expands the syncache
 * entry into a full socket, wires the toepcb to the new tcpcb and finishes
 * establishment.  The request mbuf is consumed.
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;

	so = lso = toeptoso(toep);
	tdev = toep->tp_toedev;

	SOCK_LOCK(so);
	LIST_REMOVE(toep, synq_entry);
	SOCK_UNLOCK(so);

	INP_INFO_WLOCK(&tcbinfo);
	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		UNIMPLEMENTED();
	}

	/*
	 * XXX workaround for lack of syncache drop
	 * (drops the extra reference taken in process_pass_accept_req())
	 */
	toepcb_release(toep);

	tp = sototcpcb(so);
	INP_LOCK(tp->t_inpcb);
#ifdef notyet
	so->so_snd.sb_flags |= SB_TOE;
	so->so_rcv.sb_flags |= SB_TOE;
#endif
	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(so);
	DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
	install_offload_ops(so);

	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	/* The RSS qset is carried in the mbuf's csum_data field. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
	INP_INFO_WUNLOCK(&tcbinfo);
	INP_UNLOCK(tp->t_inpcb);
	soisconnected(so);

#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct socket *so)
{
	struct mbuf *m;
	struct toedev *tdev = TOE_DEV(so);
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	unsigned int tid = toep->tp_tid;

	printf("fixup_and_send_ofo\n");

	INP_LOCK_ASSERT(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		/* Patch in the tid we only just learned. */
		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.  Consumes the mbuf m.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	/*
	 * NOTE(review): assigning the ISN into tp->rcv_wnd (a window size)
	 * in this chain looks wrong — rcv_wup/copied_seq-style sequence
	 * fields are expected here; confirm intended field.
	 */
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(so);

	if (__predict_false(so->so_state & SS_NOFDREF)) {
#ifdef notyet
		/*
		 * XXX not clear what should be done here
		 * appears to correspond to sorwakeup_locked
		 */
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
#endif
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	soisconnected(so);
	toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
	tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message.
 *
 * Swaps the temporary atid for the permanent hardware tid, binds the tid to
 * the socket and finishes establishing the active-open connection.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	/* Connection already gone: just return the atid. */
	if (tp == NULL) {
		free_atid(cdev, atid);
		return (0);
	}

	so = toeptoso(toep);
	tdev = TOE_DEV(so); /* blow up here if link was down */
	d = TOM_DATA(tdev);

	INP_LOCK(tp->t_inpcb);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, so, tid);
	free_atid(cdev, atid);
	/* The RSS qset is carried in the mbuf's csum_data field. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	INP_UNLOCK(tp->t_inpcb);
	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3149 */ 3150static void 3151wr_ack(struct toepcb *toep, struct mbuf *m) 3152{ 3153 struct tcpcb *tp = toep->tp_tp; 3154 struct cpl_wr_ack *hdr = cplhdr(m); 3155 struct socket *so = toeptoso(toep); 3156 unsigned int credits = ntohs(hdr->credits); 3157 u32 snd_una = ntohl(hdr->snd_una); 3158 int bytes = 0; 3159 3160 DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits); 3161 3162 INP_LOCK(tp->t_inpcb); 3163 3164 toep->tp_wr_avail += credits; 3165 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3166 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3167 3168 while (credits) { 3169 struct mbuf *p = peek_wr(toep); 3170 DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ; 3171 3172 if (__predict_false(!p)) { 3173 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3174 "nothing pending, state %u\n", 3175 credits, toep->tp_tid, tp->t_state); 3176 break; 3177 } 3178 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3179#if DEBUG_WR > 1 3180 struct tx_data_wr *w = cplhdr(p); 3181#ifdef notyet 3182 log(LOG_ERR, 3183 "TID %u got %u WR credits, need %u, len %u, " 3184 "main body %u, frags %u, seq # %u, ACK una %u," 3185 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3186 toep->tp_tid, credits, p->csum, p->len, 3187 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3188 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3189 WR_AVAIL(tp), count_pending_wrs(tp) - credits); 3190#endif 3191#endif 3192 p->m_pkthdr.csum_data -= credits; 3193 break; 3194 } else { 3195 dequeue_wr(toep); 3196 credits -= p->m_pkthdr.csum_data; 3197 bytes += p->m_pkthdr.len; 3198 DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len); 3199 3200 m_free(p); 3201 } 3202 } 3203 3204#if DEBUG_WR 3205 check_wr_invariants(tp); 3206#endif 3207 3208 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3209#if VALIDATE_SEQ 3210 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3211 3212 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3213 "for TID %u, snd_una 
%u\n", (&d->tdev)->name, snd_una, 3214 toep->tp_tid, tp->snd_una); 3215#endif 3216 goto out_free; 3217 } 3218 3219 if (tp->snd_una != snd_una) { 3220 tp->snd_una = snd_una; 3221 tp->ts_recent_age = ticks; 3222#ifdef notyet 3223 /* 3224 * Keep ARP entry "minty fresh" 3225 */ 3226 dst_confirm(sk->sk_dst_cache); 3227#endif 3228 if (tp->snd_una == tp->snd_nxt) 3229 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3230 } 3231 if (bytes) { 3232 DPRINTF("sbdrop(%d)\n", bytes); 3233 SOCKBUF_LOCK(&so->so_snd); 3234 sbdrop_locked(&so->so_snd, bytes); 3235 sowwakeup_locked(so); 3236 } 3237 3238 if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc) 3239 t3_push_frames(so, 0); 3240 3241out_free: 3242 INP_UNLOCK(tp->t_inpcb); 3243 m_free(m); 3244} 3245 3246/* 3247 * Handler for TX_DATA_ACK CPL messages. 3248 */ 3249static int 3250do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 3251{ 3252 struct toepcb *toep = (struct toepcb *)ctx; 3253 3254 DPRINTF("do_wr_ack\n"); 3255 dump_toepcb(toep); 3256 3257 VALIDATE_SOCK(so); 3258 3259 wr_ack(toep, m); 3260 return 0; 3261} 3262 3263 3264/* 3265 * Reset a connection that is on a listener's SYN queue or accept queue, 3266 * i.e., one that has not had a struct socket associated with it. 3267 * Must be called from process context. 3268 * 3269 * Modeled after code in inet_csk_listen_stop(). 3270 */ 3271static void 3272t3_reset_listen_child(struct socket *child) 3273{ 3274 struct tcpcb *tp = sototcpcb(child); 3275 3276 t3_send_reset(tp->t_toe); 3277} 3278 3279/* 3280 * Disconnect offloaded established but not yet accepted connections sitting 3281 * on a server's accept_queue. We just send an ABORT_REQ at this point and 3282 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 
3283 */ 3284void 3285t3_disconnect_acceptq(struct socket *listen_so) 3286{ 3287 struct socket *so; 3288 struct tcpcb *tp; 3289 3290 TAILQ_FOREACH(so, &listen_so->so_comp, so_list) { 3291 tp = sototcpcb(so); 3292 3293 if (tp->t_flags & TF_TOE) { 3294 INP_LOCK(tp->t_inpcb); 3295 t3_reset_listen_child(so); 3296 INP_UNLOCK(tp->t_inpcb); 3297 } 3298 3299 } 3300} 3301 3302/* 3303 * Reset offloaded connections sitting on a server's syn queue. As above 3304 * we send ABORT_REQ and finish off when we get ABORT_RPL. 3305 */ 3306 3307void 3308t3_reset_synq(struct listen_ctx *lctx) 3309{ 3310 struct toepcb *toep; 3311 3312 SOCK_LOCK(lctx->lso); 3313 while (!LIST_EMPTY(&lctx->synq_head)) { 3314 toep = LIST_FIRST(&lctx->synq_head); 3315 LIST_REMOVE(toep, synq_entry); 3316 toep->tp_tp = NULL; 3317 t3_send_reset(toep); 3318 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); 3319 toepcb_release(toep); 3320 } 3321 SOCK_UNLOCK(lctx->lso); 3322} 3323 3324void 3325t3_init_wr_tab(unsigned int wr_len) 3326{ 3327 int i; 3328 3329 if (mbuf_wrs[1]) /* already initialized */ 3330 return; 3331 3332 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { 3333 int sgl_len = (3 * i) / 2 + (i & 1); 3334 3335 sgl_len += 3; 3336 mbuf_wrs[i] = sgl_len <= wr_len ? 
3337 1 : 1 + (sgl_len - 2) / (wr_len - 1); 3338 } 3339 3340 wrlen = wr_len * 8; 3341} 3342 3343int 3344t3_init_cpl_io(void) 3345{ 3346#ifdef notyet 3347 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); 3348 if (!tcphdr_skb) { 3349 log(LOG_ERR, 3350 "Chelsio TCP offload: can't allocate sk_buff\n"); 3351 return -1; 3352 } 3353 skb_put(tcphdr_skb, sizeof(struct tcphdr)); 3354 tcphdr_skb->h.raw = tcphdr_skb->data; 3355 memset(tcphdr_skb->data, 0, tcphdr_skb->len); 3356#endif 3357 3358 3359 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); 3360 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); 3361 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); 3362 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); 3363 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 3364 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 3365 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 3366 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 3367 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 3368 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 3369 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 3370 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 3371#ifdef notyet 3372 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); 3373 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); 3374 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); 3375#endif 3376 return (0); 3377} 3378 3379