cxgb_cpl_io.c revision 178302
1/************************************************************************** 2 3Copyright (c) 2007-2008, Chelsio Inc. 4All rights reserved. 5 6Redistribution and use in source and binary forms, with or without 7modification, are permitted provided that the following conditions are met: 8 9 1. Redistributions of source code must retain the above copyright notice, 10 this list of conditions and the following disclaimer. 11 12 2. Neither the name of the Chelsio Corporation nor the names of its 13 contributors may be used to endorse or promote products derived from 14 this software without specific prior written permission. 15 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26POSSIBILITY OF SUCH DAMAGE. 
27 28***************************************************************************/ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 178302 2008-04-19 03:22:43Z kmacy $"); 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/fcntl.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/ktr.h> 39#include <sys/lock.h> 40#include <sys/mbuf.h> 41#include <sys/mutex.h> 42#include <sys/socket.h> 43#include <sys/sysctl.h> 44#include <sys/syslog.h> 45#include <sys/protosw.h> 46#include <sys/priv.h> 47 48#include <net/if.h> 49#include <net/route.h> 50 51#include <netinet/in.h> 52#include <netinet/in_pcb.h> 53#include <netinet/in_systm.h> 54#include <netinet/in_var.h> 55 56 57#include <dev/cxgb/cxgb_osdep.h> 58#include <dev/cxgb/sys/mbufq.h> 59 60#include <netinet/ip.h> 61#include <netinet/tcp_var.h> 62#include <netinet/tcp_fsm.h> 63#include <netinet/tcp_offload.h> 64#include <netinet/tcp_seq.h> 65#include <netinet/tcp_syncache.h> 66#include <netinet/tcp_timer.h> 67#include <net/route.h> 68 69#include <dev/cxgb/t3cdev.h> 70#include <dev/cxgb/common/cxgb_firmware_exports.h> 71#include <dev/cxgb/common/cxgb_t3_cpl.h> 72#include <dev/cxgb/common/cxgb_tcb.h> 73#include <dev/cxgb/common/cxgb_ctl_defs.h> 74#include <dev/cxgb/cxgb_offload.h> 75#include <vm/vm.h> 76#include <vm/pmap.h> 77#include <machine/bus.h> 78#include <dev/cxgb/sys/mvec.h> 79#include <dev/cxgb/ulp/toecore/cxgb_toedev.h> 80#include <dev/cxgb/ulp/tom/cxgb_defs.h> 81#include <dev/cxgb/ulp/tom/cxgb_tom.h> 82#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h> 83#include <dev/cxgb/ulp/tom/cxgb_toepcb.h> 84#include <dev/cxgb/ulp/tom/cxgb_tcp.h> 85 86#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h> 87 88/* 89 * For ULP connections HW may add headers, e.g., for digests, that aren't part 90 * of the messages sent by the host but that are part of the TCP payload and 91 * therefore consume TCP sequence space. 
Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
/* Extra payload bytes HW adds per ULP submode (0-3), e.g. for digests. */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 * (Filled in at module init; see users such as t3_push_frames below.)
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
/* DSCP portion of the connection's IP TOS, masked for the TCB TOS field. */
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

/* Debug knobs; compiled out / zero by default. */
#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

/* Close-path actions (used as return codes by the close state handling). */
#define TCP_TIMEWAIT 1
#define TCP_CLOSE 2
#define TCP_DROP 3

/* Socket-buffer autosizing knobs owned by the base TCP stack. */
extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);

/*
 * Debug wrapper around sbappend_locked(): sanity-walks the sockbuf mbuf
 * chain before and after the append, asserting that every mbuf is either
 * a plain mbuf or an EXT_EXTREF external mbuf, and that no m_next pointer
 * carries the 0xffffffff poison value.  Caller must hold the sockbuf lock.
 */
static inline void
SBAPPEND(struct sockbuf *sb, struct mbuf *n)
{
	struct mbuf *m;

	/* Validate the existing chain. */
	m = sb->sb_mb;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	/* Validate the chain being appended. */
	m = n;
	while (m) {
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
			!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
	sbappend_locked(sb, n);
	m = sb->sb_mb;

	/* Re-validate after the append. */
	while (m) {
		KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
			m->m_next, m->m_nextpkt, m->m_flags));
		m = m->m_next;
	}
}

/* True iff the TOE device is a T3A revision part. */
static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

/* Dump the interesting fields of a toepcb via DPRINTF (debug only). */
static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
	    toep->tp_mss_clamp, toep->tp_flags);
}

#ifndef RTALLOC2_DEFINED
/*
 * Like rtalloc1() but returns the route unlocked (reference still held).
 */
static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;

	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}
#endif

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
{
	struct tcpcb *tp = toep->tp_tp;

	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		inp_wlock(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
		inp_wunlock(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);	// send through L2T
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);		// send directly
}

/*
 * Map a CPL priority class to an mbuf priority.  Currently the identity
 * mapping; the toep argument is unused but kept for interface symmetry.
 */
static inline unsigned int
mkprio(unsigned int cntrl, const struct toepcb *toep)
{
	return (cntrl);
}

/*
 * Populate a TID_RELEASE WR.  The mbuf must be already properly sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

/*
 * Fill in a TX_DATA work request header at the front of mbuf m, covering
 * len payload bytes.  tail is the first unsent mbuf remaining after this
 * WR (NULL means everything queued is covered, which sets TX_SHOVE so HW
 * pushes the data out immediately).  The first WR on a connection also
 * carries the init flags, qset CPU index and send-buffer size.
 * Caller holds the inpcb lock.
 */
static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;
	struct sockbuf *snd;

	inp_lock_assert(tp->t_inpcb);
	snd = so_sockbuf_snd(so);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	    V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	    V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
		(tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		/* First data WR on this connection: add the init flags. */
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
		    V_TX_CPU_IDX(toep->tp_qset));

		/* Sendbuffer is in units of 32KB. */
		if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else {
			req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
		}

		toep->tp_flags |= TP_DATASENT;
	}
}

#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */

/*
 * Push queued send-socket-buffer data to the hardware as TX_DATA work
 * requests, consuming WR credits as it goes.  Small mbufs (<= IMM_LEN)
 * are copied inline into the WR; larger runs are described by a gather
 * list of up to TX_MAX_SEGS-1 segments.  Tracks the sockbuf send pointer
 * (sb_sndptr/sb_sndptroff) rather than dropping data, so the data stays
 * in the sockbuf until ACKed.  If req_completion is set (or half the WR
 * credits are unacked) the WR requests a completion from HW.
 * Returns the total number of payload bytes handed to the hardware.
 * Caller holds the inpcb lock; the send sockbuf lock is taken here.
 */
int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int state, bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	struct sockbuf *snd;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);
		return (0);
	}

	state = so_state_get(so);

	if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");

		return (0);
	}

	inp_lock_assert(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	sockbuf_lock(snd);

	d = TOM_DATA(toep->tp_toedev);
	cdev = d->cdev;

	/* Resume from the saved send pointer, or the head of the sockbuf. */
	last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;

	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);

	/* tp_m_last marks the last mbuf already sent; skip past it. */
	if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		sockbuf_unlock(snd);

		return (0);
	}

	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		segp = segs;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			sockbuf_unlock(snd);
			return (0);
		}
		/*
		 * If the data in tail fits as in-line, then
		 * make an immediate data wr.
		 */
		if (tail->m_len <= IMM_LEN) {
			count = 1;
			bytes = tail->m_len;
			last = tail;
			tail = tail->m_next;
			m_set_sgl(m0, NULL);
			m_set_sgllen(m0, 0);
			make_tx_data_wr(so, m0, bytes, tail);
			m_append(m0, bytes, mtod(last, caddr_t));
			KASSERT(!m0->m_next, ("bad append"));
		} else {
			/* Build a gather list while credits and segments last. */
			while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
			    && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
				bytes += tail->m_len;
				last = tail;
				count++;
				/*
				 * technically an abuse to be using this for a VA
				 * but less gross than defining my own structure
				 * or calling pmap_kextract from here :-|
				 */
				segp->ds_addr = (bus_addr_t)tail->m_data;
				segp->ds_len = tail->m_len;
				DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
				    count, mbuf_wrs[count], tail->m_data, tail->m_len);
				segp++;
				tail = tail->m_next;
			}
			DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
			    toep->tp_wr_avail, count, mbuf_wrs[count], tail);

			m_set_sgl(m0, segs);
			m_set_sgllen(m0, count);
			make_tx_data_wr(so, m0, bytes, tail);
		}
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));

		/* Advance the sockbuf send pointer past what we just sent. */
		if (tail) {
			snd->sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else
			toep->tp_m_last = snd->sb_sndptr = last;


		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		snd->sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;
		CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail, snd->sb_sndptr, snd->sb_sndptroff);
		if (tail)
			CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
		else
			CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
			    total_bytes, toep->tp_m_last, tp->snd_una);


#ifdef KTR
{
	int i;

	i = 0;
	while (i < count && m_get_sgllen(m0)) {
		if ((count - i) >= 3) {
			CTR6(KTR_TOM,
			    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
			    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
			    segs[i + 2].ds_addr, segs[i + 2].ds_len);
			i += 3;
		} else if ((count - i) == 2) {
			CTR4(KTR_TOM,
			    "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
			    segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
			i += 2;
		} else {
			CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
			    segs[i].ds_addr, segs[i].ds_len);
			i++;
		}

	}
}
#endif
		/*
		 * remember credits used (stashed in csum_data; the WR-ack
		 * path reads it back when credits are returned)
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];

		/* Request a HW completion when asked, or at half the credits. */
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;
		}
		KASSERT((m0->m_pkthdr.csum_data > 0) &&
		    (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
			m0->m_pkthdr.csum_data));
		/* The mbuf stays on the WR queue until acked; don't free it. */
		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		l2t_send(cdev, m0, toep->tp_l2t);
	}
	sockbuf_unlock(snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid;


	inp_wlock(inp);
	tp = so_sototcpcb(so);
	toep = tp->t_toe;

	/* Flush any remaining send data ahead of the FIN. */
	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);

	/* Only one FIN per connection. */
	if (toep->tp_flags & TP_FIN_SENT) {
		inp_wunlock(inp);
		return;
	}

	tid = toep->tp_tid;

	d = TOM_DATA(toep->tp_toedev);

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m_set_sgl(m, NULL);
	m_set_sgllen(m, 0);

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = 0;
	inp_wunlock(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
	cxgb_ofld_send(d->cdev, m);

}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an sk_buff.  Returns the number of credits sent.
 *
 * NOTE(review): the nofail parameter is currently unused here — allocation
 * goes through m_gethdr_nofail() unconditionally; confirm intended behavior.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}

/*
 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 * This is only used in DDP mode, so we take the opportunity to also set the
 * DACK mode and flush any Rx credits.
 */
void
t3_send_rx_modulate(struct toepcb *toep)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;

	m = m_gethdr_nofail(sizeof(*req));

	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
	    V_RX_DACK_MODE(1) |
	    V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	toep->tp_rcv_wup = toep->tp_copied_seq;
}

/*
 * Handle receipt of an urgent pointer.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
/* Urgent-data handling is not yet ported from the Linux driver. */
#ifdef URGENT_DATA_SUPPORTED
	struct tcpcb *tp = so_sototcpcb(so);

	urg_seq--;	/* initially points past the urgent data, per BSD */

	if (tp->urg_data && !after(urg_seq, tp->urg_seq))
		return;	/* duplicate pointer */
	sk_send_sigurg(sk);
	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

		tp->copied_seq++;
		if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
			tom_eat_skb(sk, skb, 0);
	}
	tp->urg_data = TCP_URG_NOTYET;
	tp->urg_seq = urg_seq;
#endif
}

/*
 * Returns true if a socket cannot accept new Rx data.
 */
static inline int
so_no_receive(const struct socket *so)
{
	return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
}

/*
 * Process an urgent data notification.
 */
static void
rx_urg_notify(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_urg_notify *hdr = cplhdr(m);
	struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);

	VALIDATE_SOCK(so);

	if (!so_no_receive(so))
		handle_urg_ptr(so, ntohl(hdr->seq));

	m_freem(m);
}

/*
 * Handler for RX_URG_NOTIFY CPL messages.
 */
static int
do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	rx_urg_notify(toep, m);
	return (0);
}

/*
 * Whether a delayed-ACK mode change may be sent for this connection.
 *
 * NOTE(review): the second clause is unreachable — if tp_ulp_mode is
 * nonzero the first operand already short-circuits to true, and if it is
 * zero the equality with ULP_MODE_TCPDDP cannot hold (assuming
 * ULP_MODE_TCPDDP != 0).  The first operand may have been intended as a
 * different check (e.g. "!toep->tp_ulp_mode") — confirm against upstream.
 */
static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
	return (toep->tp_ulp_mode ||
	    (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
		dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}

/*
 * Set of states for which we should return RX credits.
678 */ 679#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 680 681/* 682 * Called after some received data has been read. It returns RX credits 683 * to the HW for the amount of data processed. 684 */ 685void 686t3_cleanup_rbuf(struct tcpcb *tp, int copied) 687{ 688 struct toepcb *toep = tp->t_toe; 689 struct socket *so; 690 struct toedev *dev; 691 int dack_mode, must_send, read; 692 u32 thres, credits, dack = 0; 693 struct sockbuf *rcv; 694 695 so = inp_inpcbtosocket(tp->t_inpcb); 696 rcv = so_sockbuf_rcv(so); 697 698 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 699 (tp->t_state == TCPS_FIN_WAIT_2))) { 700 if (copied) { 701 sockbuf_lock(rcv); 702 toep->tp_copied_seq += copied; 703 sockbuf_unlock(rcv); 704 } 705 706 return; 707 } 708 709 inp_lock_assert(tp->t_inpcb); 710 711 sockbuf_lock(rcv); 712 if (copied) 713 toep->tp_copied_seq += copied; 714 else { 715 read = toep->tp_enqueued_bytes - rcv->sb_cc; 716 toep->tp_copied_seq += read; 717 } 718 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 719 toep->tp_enqueued_bytes = rcv->sb_cc; 720 sockbuf_unlock(rcv); 721 722 if (credits > rcv->sb_mbmax) { 723 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", 724 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 725 credits = rcv->sb_mbmax; 726 } 727 728 729 /* 730 * XXX this won't accurately reflect credit return - we need 731 * to look at the difference between the amount that has been 732 * put in the recv sockbuf and what is there now 733 */ 734 735 if (__predict_false(!credits)) 736 return; 737 738 dev = toep->tp_toedev; 739 thres = TOM_TUNABLE(dev, rx_credit_thres); 740 741 if (__predict_false(thres == 0)) 742 return; 743 744 if (is_delack_mode_valid(dev, toep)) { 745 dack_mode = TOM_TUNABLE(dev, delack); 746 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 747 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 748 749 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 750 dack = F_RX_DACK_CHANGE 
| 751 V_RX_DACK_MODE(dack_mode); 752 } 753 } else 754 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 755 756 /* 757 * For coalescing to work effectively ensure the receive window has 758 * at least 16KB left. 759 */ 760 must_send = credits + 16384 >= tp->rcv_wnd; 761 762 if (must_send || credits >= thres) 763 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 764} 765 766static int 767cxgb_toe_disconnect(struct tcpcb *tp) 768{ 769 struct socket *so; 770 771 DPRINTF("cxgb_toe_disconnect\n"); 772 773 so = inp_inpcbtosocket(tp->t_inpcb); 774 close_conn(so); 775 return (0); 776} 777 778static int 779cxgb_toe_reset(struct tcpcb *tp) 780{ 781 struct toepcb *toep = tp->t_toe; 782 783 t3_send_reset(toep); 784 785 /* 786 * unhook from socket 787 */ 788 tp->t_flags &= ~TF_TOE; 789 toep->tp_tp = NULL; 790 tp->t_toe = NULL; 791 return (0); 792} 793 794static int 795cxgb_toe_send(struct tcpcb *tp) 796{ 797 struct socket *so; 798 799 DPRINTF("cxgb_toe_send\n"); 800 dump_toepcb(tp->t_toe); 801 802 so = inp_inpcbtosocket(tp->t_inpcb); 803 t3_push_frames(so, 1); 804 return (0); 805} 806 807static int 808cxgb_toe_rcvd(struct tcpcb *tp) 809{ 810 811 inp_lock_assert(tp->t_inpcb); 812 813 t3_cleanup_rbuf(tp, 0); 814 815 return (0); 816} 817 818static void 819cxgb_toe_detach(struct tcpcb *tp) 820{ 821 struct toepcb *toep; 822 823 /* 824 * XXX how do we handle teardown in the SYN_SENT state? 
825 * 826 */ 827 inp_lock_assert(tp->t_inpcb); 828 toep = tp->t_toe; 829 toep->tp_tp = NULL; 830 831 /* 832 * unhook from socket 833 */ 834 tp->t_flags &= ~TF_TOE; 835 tp->t_toe = NULL; 836} 837 838 839static struct toe_usrreqs cxgb_toe_usrreqs = { 840 .tu_disconnect = cxgb_toe_disconnect, 841 .tu_reset = cxgb_toe_reset, 842 .tu_send = cxgb_toe_send, 843 .tu_rcvd = cxgb_toe_rcvd, 844 .tu_detach = cxgb_toe_detach, 845 .tu_detach = cxgb_toe_detach, 846 .tu_syncache_event = handle_syncache_event, 847}; 848 849 850static void 851__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, 852 uint64_t mask, uint64_t val, int no_reply) 853{ 854 struct cpl_set_tcb_field *req; 855 856 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 857 toep->tp_tid, word, mask, val); 858 859 req = mtod(m, struct cpl_set_tcb_field *); 860 m->m_pkthdr.len = m->m_len = sizeof(*req); 861 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 862 req->wr.wr_lo = 0; 863 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 864 req->reply = V_NO_REPLY(no_reply); 865 req->cpu_idx = 0; 866 req->word = htons(word); 867 req->mask = htobe64(mask); 868 req->val = htobe64(val); 869 870 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 871 send_or_defer(toep, m, 0); 872} 873 874static void 875t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) 876{ 877 struct mbuf *m; 878 struct tcpcb *tp = toep->tp_tp; 879 880 if (toep == NULL) 881 return; 882 883 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { 884 printf("not seting field\n"); 885 return; 886 } 887 888 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 889 890 __set_tcb_field(toep, m, word, mask, val, 1); 891} 892 893/* 894 * Set one of the t_flags bits in the TCB. 
 */
static void
set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
{

	t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;

	/* TCB flag is "Nagle enabled", hence the inversion of TF_NODELAY. */
	set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
}

/* Enable/disable HW receive coalescing for the connection. */
void
t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
{
	set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
}

/* Enable/disable MSS-based delayed-ACK behavior for the connection. */
void
t3_set_dack_mss(struct toepcb *toep, int on_off)
{

	set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct toepcb *toep)
{
	int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);

	t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
	    V_TCB_TOS(tos));
}


/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist).  [Note that, the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

/*
 * Turn DDP on or off for the connection.  Disabling also applies the
 * partial-placement timer workaround described above.
 */
static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {

		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
		    V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_MASK,
		    V_TF_DDP_OFF(1) |
		    TP_DDP_TIMER_WORKAROUND_VAL);

}

/* Program the tag/color for one of the two DDP buffers (buf_idx 0 or 1). */
void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    tag_color);
}

/*
 * Program the offset and length of one of the two DDP buffers.
 *
 * NOTE(review): in the buf_idx == 1 case the mask shifts the raw field
 * mask *inside* V_TCB_RX_DDP_BUF1_LEN (M_TCB_RX_DDP_BUF1_LEN << 32) while
 * the value shifts *outside* the macro — inconsistent with the BUF0 case
 * and possibly truncating the mask; confirm against the T3 TCB layout.
 */
void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
		    V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

/*
 * Select a congestion-control algorithm by name.  The lookup is compiled
 * out until CONGESTION_CONTROL_SUPPORTED is defined, so this currently
 * always reports success.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

/*
 * Request the connection's TCB from the hardware via CPL_GET_TCB.  The
 * request is deferred onto the out-of-order queue while in SYN_SENT
 * (no TID yet).  Returns 0 on success or ENOMEM if no mbuf is available.
 */
int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

/*
 * Register the toepcb under its TID with the offload device.  Takes an
 * extra reference on the toepcb; presumably dropped when the TID is
 * removed (see t3_release_offload_resources) — confirm.
 */
static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
1066 */ 1067static unsigned int 1068find_best_mtu(const struct t3c_data *d, unsigned short mtu) 1069{ 1070 int i = 0; 1071 1072 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) 1073 ++i; 1074 return (i); 1075} 1076 1077static unsigned int 1078select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) 1079{ 1080 unsigned int idx; 1081 1082#ifdef notyet 1083 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt; 1084#endif 1085 if (tp) { 1086 tp->t_maxseg = pmtu - 40; 1087 if (tp->t_maxseg < td->mtus[0] - 40) 1088 tp->t_maxseg = td->mtus[0] - 40; 1089 idx = find_best_mtu(td, tp->t_maxseg + 40); 1090 1091 tp->t_maxseg = td->mtus[idx] - 40; 1092 } else 1093 idx = find_best_mtu(td, pmtu); 1094 1095 return (idx); 1096} 1097 1098static inline void 1099free_atid(struct t3cdev *cdev, unsigned int tid) 1100{ 1101 struct toepcb *toep = cxgb_free_atid(cdev, tid); 1102 1103 if (toep) 1104 toepcb_release(toep); 1105} 1106 1107/* 1108 * Release resources held by an offload connection (TID, L2T entry, etc.) 
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	/* Nothing to release if the connection was never bound to a device. */
	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Any unequal count means work requests are still outstanding. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	/* Detach the toepcb from the tcpcb before touching the socket. */
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		/* Wake any sleeper; also drops the sockbuf lock. */
		so_sorwakeup_locked(so);

	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		/* Connection never completed: only an ATID was allocated. */
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {		// we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Switch the socket over to the offloaded socket operations and mark the
 * tcpcb as TOE-managed.  The toepcb must already be attached (asserted).
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	/* Scale only if RFC 1323 window scaling is enabled system-wide. */
	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;

	return (wscale);
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned long
select_rcv_wnd(struct toedev *dev, struct socket *so)
{
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;
	struct sockbuf *rcv;

	rcv = so_sockbuf_rcv(so);

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = rcv->sb_hiwat;



	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accomodate a coalesced segment.
	 */
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND;

	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
	    (uint32_t)d->rx_page_size * 23 :
	    MAX_RCV_WND);

	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Cross-link the tcpcb and the toepcb. */
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	/* DDP only when tunable-enabled, not opted out, and window is big enough. */
	toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	/* High word of option 0: Nagle, keepalive, TCAM bypass, wscale, MSS. */
	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	/* Low word of option 0: TOS, ULP mode, receive buffer size in KB. */
	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	    V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	/* A cong_alg tunable of -1 means "use the hardware default". */
	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
/*
 * Debug helper: sum the WR credit counts (stashed in csum_data) of all
 * mbufs currently on the toepcb's work-request queue.
 */
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
	    n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Populate 'm' with a CPL_ACT_OPEN_REQ for an active open on atid using the
 * given L2 table entry.  The 4-tuple is copied from the socket's inpcb.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
	    V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down offload state for a failed active open and notify the stack.
 * Called with the inpcb write lock held when tp != NULL; the lock is
 * dropped here before tcp_offload_drop().
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	/* The tcpcb may already have been torn down; just free the reply. */
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/* fail_act_open() drops the inpcb lock. */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

	done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	    status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
1472 */ 1473static int 1474do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 1475{ 1476 struct toepcb *toep = (struct toepcb *)ctx; 1477 struct cpl_act_open_rpl *rpl = cplhdr(m); 1478 1479 if (cdev->type != T3A && act_open_has_tid(rpl->status)) 1480 cxgb_queue_tid_release(cdev, GET_TID(rpl)); 1481 1482 active_open_failed(toep, m); 1483 return (0); 1484} 1485 1486/* 1487 * Handle an ARP failure for an active open. XXX purge ofo queue 1488 * 1489 * XXX badly broken for crossed SYNs as the ATID is no longer valid. 1490 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should 1491 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't 1492 * free the atid. Hmm. 1493 */ 1494#ifdef notyet 1495static void 1496act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m) 1497{ 1498 struct toepcb *toep = m_get_toep(m); 1499 struct tcpcb *tp = toep->tp_tp; 1500 struct inpcb *inp = tp->t_inpcb; 1501 struct socket *so; 1502 1503 inp_wlock(inp); 1504 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) { 1505 fail_act_open(so, EHOSTUNREACH); 1506 printf("freeing %p\n", m); 1507 1508 m_free(m); 1509 } else 1510 inp_wunlock(inp); 1511} 1512#endif 1513/* 1514 * Send an active open request. 
1515 */ 1516int 1517t3_connect(struct toedev *tdev, struct socket *so, 1518 struct rtentry *rt, struct sockaddr *nam) 1519{ 1520 struct mbuf *m; 1521 struct l2t_entry *e; 1522 struct tom_data *d = TOM_DATA(tdev); 1523 struct inpcb *inp = so_sotoinpcb(so); 1524 struct tcpcb *tp = intotcpcb(inp); 1525 struct toepcb *toep; /* allocated by init_offload_socket */ 1526 1527 int atid; 1528 1529 toep = toepcb_alloc(); 1530 if (toep == NULL) 1531 goto out_err; 1532 1533 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1534 goto out_err; 1535 1536 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1537 if (!e) 1538 goto free_tid; 1539 1540 inp_lock_assert(inp); 1541 m = m_gethdr(MT_DATA, M_WAITOK); 1542 1543#if 0 1544 m->m_toe.mt_toepcb = tp->t_toe; 1545 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1546#endif 1547 so_lock(so); 1548 1549 init_offload_socket(so, tdev, atid, e, rt, toep); 1550 1551 install_offload_ops(so); 1552 1553 mk_act_open_req(so, m, atid, e); 1554 so_unlock(so); 1555 1556 soisconnecting(so); 1557 toep = tp->t_toe; 1558 m_set_toep(m, tp->t_toe); 1559 1560 toep->tp_state = TCPS_SYN_SENT; 1561 l2t_send(d->cdev, (struct mbuf *)m, e); 1562 1563 if (toep->tp_ulp_mode) 1564 t3_enable_ddp(toep, 0); 1565 return (0); 1566 1567free_tid: 1568 printf("failing connect - free atid\n"); 1569 1570 free_atid(d->cdev, atid); 1571out_err: 1572 printf("return ENOMEM\n"); 1573 return (ENOMEM); 1574} 1575 1576/* 1577 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1578 * not send multiple ABORT_REQs for the same connection and also that we do 1579 * not try to send a message after the connection has closed. Returns 1 if 1580 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
1581 */ 1582static void 1583t3_send_reset(struct toepcb *toep) 1584{ 1585 1586 struct cpl_abort_req *req; 1587 unsigned int tid = toep->tp_tid; 1588 int mode = CPL_ABORT_SEND_RST; 1589 struct tcpcb *tp = toep->tp_tp; 1590 struct toedev *tdev = toep->tp_toedev; 1591 struct socket *so = NULL; 1592 struct mbuf *m; 1593 struct sockbuf *snd; 1594 1595 if (tp) { 1596 inp_lock_assert(tp->t_inpcb); 1597 so = inp_inpcbtosocket(tp->t_inpcb); 1598 } 1599 1600 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || 1601 tdev == NULL)) 1602 return; 1603 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); 1604 1605 snd = so_sockbuf_snd(so); 1606 /* Purge the send queue so we don't send anything after an abort. */ 1607 if (so) 1608 sbflush(snd); 1609 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) 1610 mode |= CPL_ABORT_POST_CLOSE_REQ; 1611 1612 m = m_gethdr_nofail(sizeof(*req)); 1613 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); 1614 set_arp_failure_handler(m, abort_arp_failure); 1615 1616 req = mtod(m, struct cpl_abort_req *); 1617 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); 1618 req->wr.wr_lo = htonl(V_WR_TID(tid)); 1619 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); 1620 req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; 1621 req->rsvd1 = !(toep->tp_flags & TP_DATASENT); 1622 req->cmd = mode; 1623 if (tp && (tp->t_state == TCPS_SYN_SENT)) 1624 mbufq_tail(&toep->out_of_order_queue, m); // defer 1625 else 1626 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); 1627} 1628 1629static int 1630t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) 1631{ 1632 struct inpcb *inp; 1633 int error, optval; 1634 1635 if (sopt->sopt_name == IP_OPTIONS) 1636 return (ENOPROTOOPT); 1637 1638 if (sopt->sopt_name != IP_TOS) 1639 return (EOPNOTSUPP); 1640 1641 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 1642 1643 if (error) 1644 return (error); 1645 1646 if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread)) 1647 return (EPERM); 1648 1649 inp = so_sotoinpcb(so); 1650 inp_ip_tos_set(inp, optval); 1651#if 0 1652 inp->inp_ip_tos = optval; 1653#endif 1654 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); 1655 1656 return (0); 1657} 1658 1659static int 1660t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1661{ 1662 int err = 0; 1663 size_t copied; 1664 1665 if (sopt->sopt_name != TCP_CONGESTION && 1666 sopt->sopt_name != TCP_NODELAY) 1667 return (EOPNOTSUPP); 1668 1669 if (sopt->sopt_name == TCP_CONGESTION) { 1670 char name[TCP_CA_NAME_MAX]; 1671 int optlen = sopt->sopt_valsize; 1672 struct tcpcb *tp; 1673 1674 if (optlen < 1) 1675 return (EINVAL); 1676 1677 err = copyinstr(sopt->sopt_val, name, 1678 min(TCP_CA_NAME_MAX - 1, optlen), &copied); 1679 if (err) 1680 return (err); 1681 if (copied < 1) 1682 return (EINVAL); 1683 1684 tp = so_sototcpcb(so); 1685 /* 1686 * XXX I need to revisit this 1687 */ 1688 if ((err = t3_set_cong_control(so, name)) == 0) { 1689#ifdef CONGESTION_CONTROL_SUPPORTED 1690 tp->t_cong_control = strdup(name, M_CXGB); 1691#endif 1692 } else 1693 return (err); 1694 } else { 1695 int optval, oldval; 1696 struct inpcb *inp; 1697 struct tcpcb *tp; 1698 1699 err = sooptcopyin(sopt, &optval, sizeof optval, 1700 sizeof optval); 1701 1702 if 
(err) 1703 return (err); 1704 1705 inp = so_sotoinpcb(so); 1706 tp = inp_inpcbtotcpcb(inp); 1707 1708 inp_wlock(inp); 1709 1710 oldval = tp->t_flags; 1711 if (optval) 1712 tp->t_flags |= TF_NODELAY; 1713 else 1714 tp->t_flags &= ~TF_NODELAY; 1715 inp_wunlock(inp); 1716 1717 1718 if (oldval != tp->t_flags) 1719 t3_set_nagle(tp->t_toe); 1720 1721 } 1722 1723 return (0); 1724} 1725 1726int 1727t3_ctloutput(struct socket *so, struct sockopt *sopt) 1728{ 1729 int err; 1730 1731 if (sopt->sopt_level != IPPROTO_TCP) 1732 err = t3_ip_ctloutput(so, sopt); 1733 else 1734 err = t3_tcp_ctloutput(so, sopt); 1735 1736 if (err != EOPNOTSUPP) 1737 return (err); 1738 1739 return (tcp_ctloutput(so, sopt)); 1740} 1741 1742/* 1743 * Returns true if we need to explicitly request RST when we receive new data 1744 * on an RX-closed connection. 1745 */ 1746static inline int 1747need_rst_on_excess_rx(const struct toepcb *toep) 1748{ 1749 return (1); 1750} 1751 1752/* 1753 * Handles Rx data that arrives in a state where the socket isn't accepting 1754 * new data. 1755 */ 1756static void 1757handle_excess_rx(struct toepcb *toep, struct mbuf *m) 1758{ 1759 1760 if (need_rst_on_excess_rx(toep) && 1761 !(toep->tp_flags & TP_ABORT_SHUTDOWN)) 1762 t3_send_reset(toep); 1763 m_freem(m); 1764} 1765 1766/* 1767 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) 1768 * by getting the DDP offset from the TCB. 
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;	/* NOTE(review): shadowed by the inner 'state' below */

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is a possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		int state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	/*
	 * Extract the current DDP placement offset for the active buffer
	 * from the raw TCB words; buffer 0 and 1 live in different fields.
	 */
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;

	/* The delta since the last known offset is the newly-placed data. */
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		/* User buffer was in flight: mark completion and flip buffers. */
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	/* Hand the DDP'ed data to the socket as a zero-copy mbuf. */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

/*
 * Account for data the hardware placed directly (DDP) ahead of this
 * CPL_RX_DATA message: any gap between tp->rcv_nxt and the CPL's sequence
 * number is DDP'ed payload that must be synthesized into an M_DDP mbuf.
 * Caller holds the inpcb lock (asserted).
 */
static void
handle_ddp_data(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data *hdr = cplhdr(m);
	unsigned int rcv_nxt = ntohl(hdr->seq);
	struct sockbuf *rcv;

	/* No gap -- nothing was DDP'ed ahead of this segment. */
	if (tp->rcv_nxt == rcv_nxt)
		return;

	inp_lock_assert(tp->t_inpcb);
	so  = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	q = &toep->tp_ddp_state;
	bsp = &q->buf_state[q->cur_buf];
	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
		rcv_nxt, tp->rcv_nxt));
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "handle_ddp_data: neg len");
	}
#endif
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
	/*
	 * For now, don't re-enable DDP after a connection fell out of  DDP
	 * mode.
	 */
	q->ubuf_ddp_ready = 0;
	sockbuf_unlock(rcv);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct sockbuf *rcv;
	int state;
	int len = be16toh(hdr->len);

	inp_wlock(tp->t_inpcb);

	so  = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {
		/* handle_excess_rx() frees m and may send a RST. */
		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		TRACE_EXIT;
		return;
	}

	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
		handle_ddp_data(toep, m);

	m->m_seq = ntohl(hdr->seq);
	m->m_ulp_mode = 0;                    /* for iSCSI */

#if VALIDATE_SEQ
	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
		log(LOG_ERR,
		    "%s: TID %u: Bad sequence number %u, expected %u\n",
		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
		    tp->rcv_nxt);
		m_freem(m);
		inp_wunlock(tp->t_inpcb);
		return;
	}
#endif
	/* Strip the CPL header; only payload goes to the socket buffer. */
	m_adj(m, sizeof(*hdr));

#ifdef URGENT_DATA_SUPPORTED
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		    tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
		    tp->rcv_nxt];
#endif
	/* Track the delayed-ACK mode the hardware reported. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);

	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
	CTR2(KTR_TOM,
	    "new_rx_data: seq 0x%x len %u",
	    m->m_seq, m->m_pkthdr.len);
	inp_wunlock(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
#if 0
	if (sb_notify(rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
#endif
	SBAPPEND(rcv, m);

#ifdef notyet
	/*
	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
	 *
	 */
	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),

	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, rcv->sb_cc, rcv->sb_mbmax));
#endif


	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
	    rcv->sb_cc, rcv->sb_mbcnt);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

/*
 * Process an RX_DATA_DDP message: payload was placed directly into a DDP
 * buffer by the hardware; synthesize an M_DDP mbuf describing the placement
 * and append it to the receive sockbuf.
 */
static void
new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	struct socket *so;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
	int nomoredata = 0;
	unsigned int delack_mode;
	struct sockbuf *rcv;

	tp = toep->tp_tp;
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {

		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		return;
	}

	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	CTR4(KTR_TOM,
	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
	    "hdr seq 0x%x len %u",
	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
	    ntohs(hdr->len));
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	tp->t_rcvtime = ticks;
	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
	inp_wunlock(tp->t_inpcb);
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
	    m->m_len, rcv_nxt, m->m_seq);
	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
	m->m_cur_offset = end_offset - m->m_pkthdr.len;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	bsp->cur_offset = end_offset;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;

	/*
	 * Length is only meaningful for kbuf
	 */
	if (!(bsp->flags & DDP_BF_NOCOPY))
		KASSERT(m->m_len <= bsp->gl->dgl_length,
		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
			m->m_len, bsp->gl->dgl_length));

	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
		panic("spurious ddp completion");
	} else {
		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_ddp_flags |= DDP_BF_PSH;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
#endif
	SBAPPEND(rcv, m);

	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR |\
		 F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	/* Reject messages carrying any of the DDP_ERR error bits. */
	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		    GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return (CPL_RET_BUF_DONE);
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	new_rx_data_ddp(toep, m);
	return (0);
}

/*
 * Process an RX_DDP_COMPLETE: the hardware finished filling a DDP buffer.
 * Synthesize an M_DDP mbuf covering the data placed since the last report
 * and append it to the receive sockbuf.
 */
static void
process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when, delack_mode;
	int nomoredata = 0;
	struct sockbuf *rcv;

	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {
		struct inpcb *inp = so_sotoinpcb(so);

		handle_excess_rx(toep, m);
		inp_wunlock(inp);
		return;
	}
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	m->m_pkthdr.csum_data = tp->rcv_nxt;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	bsp = &q->buf_state[buf_idx];
	when = bsp->cur_offset;
	/* Data placed since the last report is the new mbuf's payload. */
	m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
	tp->rcv_nxt += m->m_len;
	tp->t_rcvtime = ticks;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
#endif
	inp_wunlock(tp->t_inpcb);

	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR5(KTR_TOM,
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report), m->m_len);

	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP)) {
		q->cur_buf ^= 1;                     /* flip buffers */
		if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
			nomoredata=1;
	}

	CTR4(KTR_TOM,
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report));

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

	SBAPPEND(rcv, m);
	if ((so_state_get(so) & SS_NOFDREF) == 0)
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(toep, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct tcpcb *tp)
{
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	inp_wlock(tp->t_inpcb);
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	inp_wunlock(tp->t_inpcb);
	tcp_offload_twstart(tp);
}

/*
 * Identical to enter_timewait() except the disconnect variant of the
 * TIME_WAIT transition is invoked at the end.
 */
static void
enter_timewait_disconnect(struct tcpcb *tp)
{
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	inp_wlock(tp->t_inpcb);
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	inp_wunlock(tp->t_inpcb);
	tcp_offload_twstart_disconnect(tp);
}

/*
 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
 * function deals with the data that may be reported along with the FIN.
 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
 * perform normal FIN-related processing.  In the latter case 1 indicates that
 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
 * skb can be freed.
2433 */ 2434static int 2435handle_peer_close_data(struct socket *so, struct mbuf *m) 2436{ 2437 struct tcpcb *tp = so_sototcpcb(so); 2438 struct toepcb *toep = tp->t_toe; 2439 struct ddp_state *q; 2440 struct ddp_buf_state *bsp; 2441 struct cpl_peer_close *req = cplhdr(m); 2442 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ 2443 struct sockbuf *rcv; 2444 2445 if (tp->rcv_nxt == rcv_nxt) /* no data */ 2446 return (0); 2447 2448 CTR0(KTR_TOM, "handle_peer_close_data"); 2449 if (__predict_false(so_no_receive(so))) { 2450 handle_excess_rx(toep, m); 2451 2452 /* 2453 * Although we discard the data we want to process the FIN so 2454 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + 2455 * PEER_CLOSE without data. In particular this PEER_CLOSE 2456 * may be what will close the connection. We return 1 because 2457 * handle_excess_rx() already freed the packet. 2458 */ 2459 return (1); 2460 } 2461 2462 inp_lock_assert(tp->t_inpcb); 2463 q = &toep->tp_ddp_state; 2464 rcv = so_sockbuf_rcv(so); 2465 sockbuf_lock(rcv); 2466 2467 bsp = &q->buf_state[q->cur_buf]; 2468 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; 2469 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2470 m->m_ddp_gl = (unsigned char *)bsp->gl; 2471 m->m_flags |= M_DDP; 2472 m->m_cur_offset = bsp->cur_offset; 2473 m->m_ddp_flags = 2474 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; 2475 m->m_seq = tp->rcv_nxt; 2476 tp->rcv_nxt = rcv_nxt; 2477 bsp->cur_offset += m->m_pkthdr.len; 2478 if (!(bsp->flags & DDP_BF_NOFLIP)) 2479 q->cur_buf ^= 1; 2480#ifdef notyet 2481 skb_reset_transport_header(skb); 2482 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ 2483#endif 2484 tp->t_rcvtime = ticks; 2485 SBAPPEND(rcv, m); 2486 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 2487 so_sorwakeup_locked(so); 2488 else 2489 sockbuf_unlock(rcv); 2490 2491 return (1); 2492} 2493 2494/* 2495 * Handle a peer FIN. 
2496 */ 2497static void 2498do_peer_fin(struct toepcb *toep, struct mbuf *m) 2499{ 2500 struct socket *so; 2501 struct tcpcb *tp = toep->tp_tp; 2502 int keep, action; 2503 2504 action = keep = 0; 2505 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state); 2506 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { 2507 printf("abort_pending set\n"); 2508 2509 goto out; 2510 } 2511 inp_wlock(tp->t_inpcb); 2512 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 2513 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { 2514 keep = handle_peer_close_data(so, m); 2515 if (keep < 0) { 2516 inp_wunlock(tp->t_inpcb); 2517 return; 2518 } 2519 } 2520 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2521 socantrcvmore(so); 2522 /* 2523 * If connection is half-synchronized 2524 * (ie NEEDSYN flag on) then delay ACK, 2525 * so it may be piggybacked when SYN is sent. 2526 * Otherwise, since we received a FIN then no 2527 * more input can be expected, send ACK now. 2528 */ 2529 if (tp->t_flags & TF_NEEDSYN) 2530 tp->t_flags |= TF_DELACK; 2531 else 2532 tp->t_flags |= TF_ACKNOW; 2533 tp->rcv_nxt++; 2534 } 2535 2536 switch (tp->t_state) { 2537 case TCPS_SYN_RECEIVED: 2538 tp->t_starttime = ticks; 2539 /* FALLTHROUGH */ 2540 case TCPS_ESTABLISHED: 2541 tp->t_state = TCPS_CLOSE_WAIT; 2542 break; 2543 case TCPS_FIN_WAIT_1: 2544 tp->t_state = TCPS_CLOSING; 2545 break; 2546 case TCPS_FIN_WAIT_2: 2547 /* 2548 * If we've sent an abort_req we must have sent it too late, 2549 * HW will send us a reply telling us so, and this peer_close 2550 * is really the last message for this connection and needs to 2551 * be treated as an abort_rpl, i.e., transition the connection 2552 * to TCP_CLOSE (note that the host stack does this at the 2553 * time of generating the RST but we must wait for HW). 2554 * Otherwise we enter TIME_WAIT. 
2555 */ 2556 t3_release_offload_resources(toep); 2557 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2558 action = TCP_CLOSE; 2559 } else { 2560 action = TCP_TIMEWAIT; 2561 } 2562 break; 2563 default: 2564 log(LOG_ERR, 2565 "%s: TID %u received PEER_CLOSE in bad state %d\n", 2566 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); 2567 } 2568 inp_wunlock(tp->t_inpcb); 2569 2570 DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(rcv), rcv->sb_flags); 2571 2572 2573 if (action == TCP_TIMEWAIT) { 2574 enter_timewait(tp); 2575 } else if (action == TCP_DROP) { 2576 tcp_offload_drop(tp, 0); 2577 } else if (action == TCP_CLOSE) { 2578 tcp_offload_close(tp); 2579 } 2580 2581#ifdef notyet 2582 /* Do not send POLL_HUP for half duplex close. */ 2583 if ((sk->sk_shutdown & SEND_SHUTDOWN) || 2584 sk->sk_state == TCP_CLOSE) 2585 sk_wake_async(so, 1, POLL_HUP); 2586 else 2587 sk_wake_async(so, 1, POLL_IN); 2588#endif 2589 2590out: 2591 if (!keep) 2592 m_free(m); 2593} 2594 2595/* 2596 * Handler for PEER_CLOSE CPL messages. 
2597 */ 2598static int 2599do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2600{ 2601 struct toepcb *toep = (struct toepcb *)ctx; 2602 2603 VALIDATE_SOCK(so); 2604 2605 do_peer_fin(toep, m); 2606 return (0); 2607} 2608 2609static void 2610process_close_con_rpl(struct toepcb *toep, struct mbuf *m) 2611{ 2612 struct cpl_close_con_rpl *rpl = cplhdr(m); 2613 struct tcpcb *tp = toep->tp_tp; 2614 struct socket *so; 2615 int action = 0; 2616 struct sockbuf *rcv; 2617 2618 inp_wlock(tp->t_inpcb); 2619 so = inp_inpcbtosocket(tp->t_inpcb); 2620 2621 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ 2622 2623 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { 2624 inp_wunlock(tp->t_inpcb); 2625 goto out; 2626 } 2627 2628 CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep, 2629 tp->t_state, !!(so_state_get(so) & SS_NOFDREF)); 2630 2631 switch (tp->t_state) { 2632 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ 2633 t3_release_offload_resources(toep); 2634 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2635 action = TCP_CLOSE; 2636 2637 } else { 2638 action = TCP_TIMEWAIT; 2639 } 2640 break; 2641 case TCPS_LAST_ACK: 2642 /* 2643 * In this state we don't care about pending abort_rpl. 2644 * If we've sent abort_req it was post-close and was sent too 2645 * late, this close_con_rpl is the actual last message. 2646 */ 2647 t3_release_offload_resources(toep); 2648 action = TCP_CLOSE; 2649 break; 2650 case TCPS_FIN_WAIT_1: 2651 /* 2652 * If we can't receive any more 2653 * data, then closing user can proceed. 2654 * Starting the timer is contrary to the 2655 * specification, but if we don't get a FIN 2656 * we'll hang forever. 2657 * 2658 * XXXjl: 2659 * we should release the tp also, and use a 2660 * compressed state. 
2661 */ 2662 if (so) 2663 rcv = so_sockbuf_rcv(so); 2664 else 2665 break; 2666 2667 if (rcv->sb_state & SBS_CANTRCVMORE) { 2668 int timeout; 2669 2670 if (so) 2671 soisdisconnected(so); 2672 timeout = (tcp_fast_finwait2_recycle) ? 2673 tcp_finwait2_timeout : tcp_maxidle; 2674 tcp_timer_activate(tp, TT_2MSL, timeout); 2675 } 2676 tp->t_state = TCPS_FIN_WAIT_2; 2677 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 && 2678 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { 2679 action = TCP_DROP; 2680 } 2681 2682 break; 2683 default: 2684 log(LOG_ERR, 2685 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", 2686 toep->tp_toedev->tod_name, toep->tp_tid, 2687 tp->t_state); 2688 } 2689 inp_wunlock(tp->t_inpcb); 2690 2691 2692 if (action == TCP_TIMEWAIT) { 2693 enter_timewait_disconnect(tp); 2694 } else if (action == TCP_DROP) { 2695 tcp_offload_drop(tp, 0); 2696 } else if (action == TCP_CLOSE) { 2697 tcp_offload_close(tp); 2698 } 2699out: 2700 m_freem(m); 2701} 2702 2703/* 2704 * Handler for CLOSE_CON_RPL CPL messages. 2705 */ 2706static int 2707do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, 2708 void *ctx) 2709{ 2710 struct toepcb *toep = (struct toepcb *)ctx; 2711 2712 process_close_con_rpl(toep, m); 2713 return (0); 2714} 2715 2716/* 2717 * Process abort replies. We only process these messages if we anticipate 2718 * them as the coordination between SW and HW in this area is somewhat lacking 2719 * and sometimes we get ABORT_RPLs after we are done with the connection that 2720 * originated the ABORT_REQ. 
2721 */ 2722static void 2723process_abort_rpl(struct toepcb *toep, struct mbuf *m) 2724{ 2725 struct tcpcb *tp = toep->tp_tp; 2726 struct socket *so; 2727 int needclose = 0; 2728 2729#ifdef T3_TRACE 2730 T3_TRACE1(TIDTB(sk), 2731 "process_abort_rpl: GTS rpl pending %d", 2732 sock_flag(sk, ABORT_RPL_PENDING)); 2733#endif 2734 2735 inp_wlock(tp->t_inpcb); 2736 so = inp_inpcbtosocket(tp->t_inpcb); 2737 2738 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2739 /* 2740 * XXX panic on tcpdrop 2741 */ 2742 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) 2743 toep->tp_flags |= TP_ABORT_RPL_RCVD; 2744 else { 2745 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); 2746 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || 2747 !is_t3a(toep->tp_toedev)) { 2748 if (toep->tp_flags & TP_ABORT_REQ_RCVD) 2749 panic("TP_ABORT_REQ_RCVD set"); 2750 t3_release_offload_resources(toep); 2751 needclose = 1; 2752 } 2753 } 2754 } 2755 inp_wunlock(tp->t_inpcb); 2756 2757 if (needclose) 2758 tcp_offload_close(tp); 2759 2760 m_free(m); 2761} 2762 2763/* 2764 * Handle an ABORT_RPL_RSS CPL message. 2765 */ 2766static int 2767do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2768{ 2769 struct cpl_abort_rpl_rss *rpl = cplhdr(m); 2770 struct toepcb *toep; 2771 2772 /* 2773 * Ignore replies to post-close aborts indicating that the abort was 2774 * requested too late. These connections are terminated when we get 2775 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 2776 * arrives the TID is either no longer used or it has been recycled. 
2777 */ 2778 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2779discard: 2780 m_free(m); 2781 return (0); 2782 } 2783 2784 toep = (struct toepcb *)ctx; 2785 2786 /* 2787 * Sometimes we've already closed the socket, e.g., a post-close 2788 * abort races with ABORT_REQ_RSS, the latter frees the socket 2789 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2790 * but FW turns the ABORT_REQ into a regular one and so we get 2791 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2792 */ 2793 if (!toep) 2794 goto discard; 2795 2796 if (toep->tp_tp == NULL) { 2797 log(LOG_NOTICE, "removing tid for abort\n"); 2798 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2799 if (toep->tp_l2t) 2800 l2t_release(L2DATA(cdev), toep->tp_l2t); 2801 2802 toepcb_release(toep); 2803 goto discard; 2804 } 2805 2806 log(LOG_NOTICE, "toep=%p\n", toep); 2807 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); 2808 2809 toepcb_hold(toep); 2810 process_abort_rpl(toep, m); 2811 toepcb_release(toep); 2812 return (0); 2813} 2814 2815/* 2816 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also 2817 * indicate whether RST should be sent in response. 2818 */ 2819static int 2820abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2821{ 2822 struct tcpcb *tp = so_sototcpcb(so); 2823 2824 switch (abort_reason) { 2825 case CPL_ERR_BAD_SYN: 2826#if 0 2827 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2828#endif 2829 case CPL_ERR_CONN_RESET: 2830 // XXX need to handle SYN_RECV due to crossed SYNs 2831 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2832 case CPL_ERR_XMIT_TIMEDOUT: 2833 case CPL_ERR_PERSIST_TIMEDOUT: 2834 case CPL_ERR_FINWAIT2_TIMEDOUT: 2835 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2836#if 0 2837 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2838#endif 2839 return (ETIMEDOUT); 2840 default: 2841 return (EIO); 2842 } 2843} 2844 2845static inline void 2846set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2847{ 2848 struct cpl_abort_rpl *rpl = cplhdr(m); 2849 2850 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2851 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2852 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2853 2854 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2855 rpl->cmd = cmd; 2856} 2857 2858static void 2859send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2860{ 2861 struct mbuf *reply_mbuf; 2862 struct cpl_abort_req_rss *req = cplhdr(m); 2863 2864 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2865 m_set_priority(m, CPL_PRIORITY_DATA); 2866 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2867 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2868 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2869 m_free(m); 2870} 2871 2872/* 2873 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2874 */ 2875static inline int 2876is_neg_adv_abort(unsigned int status) 2877{ 2878 return status == CPL_ERR_RTX_NEG_ADVICE || 2879 status == CPL_ERR_PERSIST_NEG_ADVICE; 2880} 2881 2882static void 2883send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2884{ 2885 struct mbuf *reply_mbuf; 2886 struct cpl_abort_req_rss *req = cplhdr(m); 2887 2888 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2889 2890 if (!reply_mbuf) { 2891 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2892 req->status = rst_status; 2893 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2894 return; 2895 } 2896 2897 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2898 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2899 m_free(m); 2900 2901 /* 2902 * XXX need to sync with ARP as for SYN_RECV connections we can send 2903 * these messages while ARP is pending. For other connection states 2904 * it's not a problem. 2905 */ 2906 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2907} 2908 2909#ifdef notyet 2910static void 2911cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2912{ 2913 CXGB_UNIMPLEMENTED(); 2914#ifdef notyet 2915 struct request_sock *req = child->sk_user_data; 2916 2917 inet_csk_reqsk_queue_removed(parent, req); 2918 synq_remove(tcp_sk(child)); 2919 __reqsk_free(req); 2920 child->sk_user_data = NULL; 2921#endif 2922} 2923 2924 2925/* 2926 * Performs the actual work to abort a SYN_RECV connection. 2927 */ 2928static void 2929do_abort_syn_rcv(struct socket *child, struct socket *parent) 2930{ 2931 struct tcpcb *parenttp = so_sototcpcb(parent); 2932 struct tcpcb *childtp = so_sototcpcb(child); 2933 2934 /* 2935 * If the server is still open we clean up the child connection, 2936 * otherwise the server already did the clean up as it was purging 2937 * its SYN queue and the skb was just sitting in its backlog. 2938 */ 2939 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2940 cleanup_syn_rcv_conn(child, parent); 2941 inp_wlock(childtp->t_inpcb); 2942 t3_release_offload_resources(childtp->t_toe); 2943 inp_wunlock(childtp->t_inpcb); 2944 tcp_offload_close(childtp); 2945 } 2946} 2947#endif 2948 2949/* 2950 * Handle abort requests for a SYN_RECV connection. These need extra work 2951 * because the socket is on its parent's SYN queue. 
2952 */ 2953static int 2954abort_syn_rcv(struct socket *so, struct mbuf *m) 2955{ 2956 CXGB_UNIMPLEMENTED(); 2957#ifdef notyet 2958 struct socket *parent; 2959 struct toedev *tdev = toep->tp_toedev; 2960 struct t3cdev *cdev = TOM_DATA(tdev)->cdev; 2961 struct socket *oreq = so->so_incomp; 2962 struct t3c_tid_entry *t3c_stid; 2963 struct tid_info *t; 2964 2965 if (!oreq) 2966 return -1; /* somehow we are not on the SYN queue */ 2967 2968 t = &(T3C_DATA(cdev))->tid_maps; 2969 t3c_stid = lookup_stid(t, oreq->ts_recent); 2970 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 2971 2972 so_lock(parent); 2973 do_abort_syn_rcv(so, parent); 2974 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); 2975 so_unlock(parent); 2976#endif 2977 return (0); 2978} 2979 2980/* 2981 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this 2982 * request except that we need to reply to it. 2983 */ 2984static void 2985process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) 2986{ 2987 int rst_status = CPL_ABORT_NO_RST; 2988 const struct cpl_abort_req_rss *req = cplhdr(m); 2989 struct tcpcb *tp = toep->tp_tp; 2990 struct socket *so; 2991 int needclose = 0; 2992 2993 inp_wlock(tp->t_inpcb); 2994 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 2995 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { 2996 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); 2997 m_free(m); 2998 goto skip; 2999 } 3000 3001 toep->tp_flags &= ~TP_ABORT_REQ_RCVD; 3002 /* 3003 * Three cases to consider: 3004 * a) We haven't sent an abort_req; close the connection. 3005 * b) We have sent a post-close abort_req that will get to TP too late 3006 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will 3007 * be ignored and the connection should be closed now. 3008 * c) We have sent a regular abort_req that will get to TP too late. 3009 * That will generate an abort_rpl with status 0, wait for it. 
3010 */ 3011 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || 3012 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { 3013 int error; 3014 3015 error = abort_status_to_errno(so, req->status, 3016 &rst_status); 3017 so_error_set(so, error); 3018 3019 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 3020 so_sorwakeup(so); 3021 /* 3022 * SYN_RECV needs special processing. If abort_syn_rcv() 3023 * returns 0 is has taken care of the abort. 3024 */ 3025 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) 3026 goto skip; 3027 3028 t3_release_offload_resources(toep); 3029 needclose = 1; 3030 } 3031 inp_wunlock(tp->t_inpcb); 3032 3033 if (needclose) 3034 tcp_offload_close(tp); 3035 3036 send_abort_rpl(m, tdev, rst_status); 3037 return; 3038skip: 3039 inp_wunlock(tp->t_inpcb); 3040} 3041 3042/* 3043 * Handle an ABORT_REQ_RSS CPL message. 3044 */ 3045static int 3046do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) 3047{ 3048 const struct cpl_abort_req_rss *req = cplhdr(m); 3049 struct toepcb *toep = (struct toepcb *)ctx; 3050 3051 if (is_neg_adv_abort(req->status)) { 3052 m_free(m); 3053 return (0); 3054 } 3055 3056 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); 3057 3058 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { 3059 cxgb_remove_tid(cdev, toep, toep->tp_tid); 3060 toep->tp_flags |= TP_ABORT_REQ_RCVD; 3061 3062 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); 3063 if (toep->tp_l2t) 3064 l2t_release(L2DATA(cdev), toep->tp_l2t); 3065 3066 /* 3067 * Unhook 3068 */ 3069 toep->tp_tp->t_toe = NULL; 3070 toep->tp_tp->t_flags &= ~TF_TOE; 3071 toep->tp_tp = NULL; 3072 /* 3073 * XXX need to call syncache_chkrst - but we don't 3074 * have a way of doing that yet 3075 */ 3076 toepcb_release(toep); 3077 log(LOG_ERR, "abort for unestablished connection :-(\n"); 3078 return (0); 3079 } 3080 if (toep->tp_tp == NULL) { 3081 log(LOG_NOTICE, "disconnected toepcb\n"); 3082 /* should be freed 
momentarily */ 3083 return (0); 3084 } 3085 3086 3087 toepcb_hold(toep); 3088 process_abort_req(toep, m, toep->tp_toedev); 3089 toepcb_release(toep); 3090 return (0); 3091} 3092#ifdef notyet 3093static void 3094pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) 3095{ 3096 struct toedev *tdev = TOE_DEV(parent); 3097 3098 do_abort_syn_rcv(child, parent); 3099 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { 3100 struct cpl_pass_accept_rpl *rpl = cplhdr(m); 3101 3102 rpl->opt0h = htonl(F_TCAM_BYPASS); 3103 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3104 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 3105 } else 3106 m_free(m); 3107} 3108#endif 3109static void 3110handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) 3111{ 3112 CXGB_UNIMPLEMENTED(); 3113 3114#ifdef notyet 3115 struct t3cdev *cdev; 3116 struct socket *parent; 3117 struct socket *oreq; 3118 struct t3c_tid_entry *t3c_stid; 3119 struct tid_info *t; 3120 struct tcpcb *otp, *tp = so_sototcpcb(so); 3121 struct toepcb *toep = tp->t_toe; 3122 3123 /* 3124 * If the connection is being aborted due to the parent listening 3125 * socket going away there's nothing to do, the ABORT_REQ will close 3126 * the connection. 3127 */ 3128 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 3129 m_free(m); 3130 return; 3131 } 3132 3133 oreq = so->so_incomp; 3134 otp = so_sototcpcb(oreq); 3135 3136 cdev = T3C_DEV(so); 3137 t = &(T3C_DATA(cdev))->tid_maps; 3138 t3c_stid = lookup_stid(t, otp->ts_recent); 3139 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 3140 3141 so_lock(parent); 3142 pass_open_abort(so, parent, m); 3143 so_unlock(parent); 3144#endif 3145} 3146 3147/* 3148 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly 3149 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV 3150 * connection. 
3151 */ 3152static void 3153pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) 3154{ 3155 3156#ifdef notyet 3157 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3158 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); 3159#endif 3160 handle_pass_open_arp_failure(m_get_socket(m), m); 3161} 3162 3163/* 3164 * Populate a reject CPL_PASS_ACCEPT_RPL WR. 3165 */ 3166static void 3167mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) 3168{ 3169 struct cpl_pass_accept_req *req = cplhdr(req_mbuf); 3170 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); 3171 unsigned int tid = GET_TID(req); 3172 3173 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); 3174 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3175 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3176 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet 3177 rpl->opt0h = htonl(F_TCAM_BYPASS); 3178 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3179 rpl->opt2 = 0; 3180 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3181} 3182 3183/* 3184 * Send a deferred reject to an accept request. 
3185 */ 3186static void 3187reject_pass_request(struct toedev *tdev, struct mbuf *m) 3188{ 3189 struct mbuf *reply_mbuf; 3190 3191 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 3192 mk_pass_accept_rpl(reply_mbuf, m); 3193 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 3194 m_free(m); 3195} 3196 3197static void 3198handle_syncache_event(int event, void *arg) 3199{ 3200 struct toepcb *toep = arg; 3201 3202 switch (event) { 3203 case TOE_SC_ENTRY_PRESENT: 3204 /* 3205 * entry already exists - free toepcb 3206 * and l2t 3207 */ 3208 printf("syncache entry present\n"); 3209 toepcb_release(toep); 3210 break; 3211 case TOE_SC_DROP: 3212 /* 3213 * The syncache has given up on this entry 3214 * either it timed out, or it was evicted 3215 * we need to explicitly release the tid 3216 */ 3217 printf("syncache entry dropped\n"); 3218 toepcb_release(toep); 3219 break; 3220 default: 3221 log(LOG_ERR, "unknown syncache event %d\n", event); 3222 break; 3223 } 3224} 3225 3226static void 3227syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 3228{ 3229 struct in_conninfo inc; 3230 struct tcpopt to; 3231 struct tcphdr th; 3232 struct inpcb *inp; 3233 int mss, wsf, sack, ts; 3234 uint32_t rcv_isn = ntohl(req->rcv_isn); 3235 3236 bzero(&to, sizeof(struct tcpopt)); 3237 inp = so_sotoinpcb(lso); 3238 3239 /* 3240 * Fill out information for entering us into the syncache 3241 */ 3242 inc.inc_fport = th.th_sport = req->peer_port; 3243 inc.inc_lport = th.th_dport = req->local_port; 3244 th.th_seq = req->rcv_isn; 3245 th.th_flags = TH_SYN; 3246 3247 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; 3248 3249 3250 inc.inc_isipv6 = 0; 3251 inc.inc_len = 0; 3252 inc.inc_faddr.s_addr = req->peer_ip; 3253 inc.inc_laddr.s_addr = req->local_ip; 3254 3255 DPRINTF("syncache add of %d:%d %d:%d\n", 3256 ntohl(req->local_ip), ntohs(req->local_port), 3257 ntohl(req->peer_ip), 
ntohs(req->peer_port)); 3258 3259 mss = req->tcp_options.mss; 3260 wsf = req->tcp_options.wsf; 3261 ts = req->tcp_options.tstamp; 3262 sack = req->tcp_options.sack; 3263 to.to_mss = mss; 3264 to.to_wscale = wsf; 3265 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3266 syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); 3267} 3268 3269 3270/* 3271 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket 3272 * lock held. Note that the sock here is a listening socket that is not owned 3273 * by the TOE. 3274 */ 3275static void 3276process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, 3277 struct listen_ctx *lctx) 3278{ 3279 int rt_flags; 3280 struct l2t_entry *e; 3281 struct iff_mac tim; 3282 struct mbuf *reply_mbuf, *ddp_mbuf = NULL; 3283 struct cpl_pass_accept_rpl *rpl; 3284 struct cpl_pass_accept_req *req = cplhdr(m); 3285 unsigned int tid = GET_TID(req); 3286 struct tom_data *d = TOM_DATA(tdev); 3287 struct t3cdev *cdev = d->cdev; 3288 struct tcpcb *tp = so_sototcpcb(so); 3289 struct toepcb *newtoep; 3290 struct rtentry *dst; 3291 struct sockaddr_in nam; 3292 struct t3c_data *td = T3C_DATA(cdev); 3293 3294 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3295 if (__predict_false(reply_mbuf == NULL)) { 3296 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3297 t3_defer_reply(m, tdev, reject_pass_request); 3298 else { 3299 cxgb_queue_tid_release(cdev, tid); 3300 m_free(m); 3301 } 3302 DPRINTF("failed to get reply_mbuf\n"); 3303 3304 goto out; 3305 } 3306 3307 if (tp->t_state != TCPS_LISTEN) { 3308 DPRINTF("socket not in listen state\n"); 3309 3310 goto reject; 3311 } 3312 3313 tim.mac_addr = req->dst_mac; 3314 tim.vlan_tag = ntohs(req->vlan_tag); 3315 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { 3316 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); 3317 goto reject; 3318 } 3319 3320#ifdef notyet 3321 /* 3322 * XXX do route lookup to 
confirm that we're still listening on this 3323 * address 3324 */ 3325 if (ip_route_input(skb, req->local_ip, req->peer_ip, 3326 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) 3327 goto reject; 3328 rt_flags = ((struct rtable *)skb->dst)->rt_flags & 3329 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); 3330 dst_release(skb->dst); // done with the input route, release it 3331 skb->dst = NULL; 3332 3333 if ((rt_flags & RTF_LOCAL) == 0) 3334 goto reject; 3335#endif 3336 /* 3337 * XXX 3338 */ 3339 rt_flags = RTF_LOCAL; 3340 if ((rt_flags & RTF_LOCAL) == 0) 3341 goto reject; 3342 3343 /* 3344 * Calculate values and add to syncache 3345 */ 3346 3347 newtoep = toepcb_alloc(); 3348 if (newtoep == NULL) 3349 goto reject; 3350 3351 bzero(&nam, sizeof(struct sockaddr_in)); 3352 3353 nam.sin_len = sizeof(struct sockaddr_in); 3354 nam.sin_family = AF_INET; 3355 nam.sin_addr.s_addr =req->peer_ip; 3356 dst = rtalloc2((struct sockaddr *)&nam, 1, 0); 3357 3358 if (dst == NULL) { 3359 printf("failed to find route\n"); 3360 goto reject; 3361 } 3362 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, 3363 (struct sockaddr *)&nam); 3364 if (e == NULL) { 3365 DPRINTF("failed to get l2t\n"); 3366 } 3367 /* 3368 * Point to our listen socket until accept 3369 */ 3370 newtoep->tp_tp = tp; 3371 newtoep->tp_flags = TP_SYN_RCVD; 3372 newtoep->tp_tid = tid; 3373 newtoep->tp_toedev = tdev; 3374 tp->rcv_wnd = select_rcv_wnd(tdev, so); 3375 3376 cxgb_insert_tid(cdev, d->client, newtoep, tid); 3377 so_lock(so); 3378 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); 3379 so_unlock(so); 3380 3381 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && 3382 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; 3383 3384 if (newtoep->tp_ulp_mode) { 3385 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3386 3387 if (ddp_mbuf == NULL) 3388 newtoep->tp_ulp_mode = 0; 3389 } 3390 3391 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", 3392 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); 3393 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); 3394 /* 3395 * XXX workaround for lack of syncache drop 3396 */ 3397 toepcb_hold(newtoep); 3398 syncache_add_accept_req(req, so, newtoep); 3399 3400 rpl = cplhdr(reply_mbuf); 3401 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); 3402 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3403 rpl->wr.wr_lo = 0; 3404 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3405 rpl->opt2 = htonl(calc_opt2(so, tdev)); 3406 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3407 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten 3408 3409 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | 3410 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); 3411 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | 3412 CPL_PASS_OPEN_ACCEPT); 3413 3414 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); 3415 3416 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); 3417 3418 l2t_send(cdev, reply_mbuf, e); 3419 m_free(m); 3420 if (newtoep->tp_ulp_mode) { 3421 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, 3422 V_TF_DDP_OFF(1) | 3423 TP_DDP_TIMER_WORKAROUND_MASK, 3424 V_TF_DDP_OFF(1) | 3425 TP_DDP_TIMER_WORKAROUND_VAL, 1); 3426 } else 3427 printf("not offloading\n"); 3428 3429 3430 3431 return; 3432reject: 3433 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3434 mk_pass_accept_rpl(reply_mbuf, m); 3435 else 3436 mk_tid_release(reply_mbuf, newtoep, tid); 3437 cxgb_ofld_send(cdev, reply_mbuf); 3438 m_free(m); 3439out: 3440#if 0 3441 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3442#else 3443 return; 3444#endif 3445} 3446 
/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 *
 * Thin CPL-handler wrapper: recovers the listen context stashed as the
 * handler's opaque ctx and forwards the message to
 * process_pass_accept_req() for the real work.  Always consumes/forwards
 * the mbuf via that helper and reports success to the CPL dispatcher.
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	/*
	 * NOTE(review): this entire section is dead, Linux-derived code and
	 * would not compile if VALIDATE_TID were enabled: `lsk` is not
	 * declared anywhere in this function (presumably it should be `lso`),
	 * and printk/KERN_ERR/unlikely/cdev->name are Linux idioms, not
	 * FreeBSD ones.  Left as-is; needs porting before it can be enabled.
	 */
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
3495 */ 3496static void 3497assign_rxopt(struct socket *so, unsigned int opt) 3498{ 3499 struct tcpcb *tp = so_sototcpcb(so); 3500 struct toepcb *toep = tp->t_toe; 3501 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep)); 3502 3503 inp_lock_assert(tp->t_inpcb); 3504 3505 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3506 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; 3507 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; 3508 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; 3509 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3510 (TF_RCVD_SCALE|TF_REQ_SCALE)) 3511 tp->rcv_scale = tp->request_r_scale; 3512} 3513 3514/* 3515 * Completes some final bits of initialization for just established connections 3516 * and changes their state to TCP_ESTABLISHED. 3517 * 3518 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. 3519 */ 3520static void 3521make_established(struct socket *so, u32 snd_isn, unsigned int opt) 3522{ 3523 struct tcpcb *tp = so_sototcpcb(so); 3524 struct toepcb *toep = tp->t_toe; 3525 3526 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; 3527 assign_rxopt(so, opt); 3528 3529 /* 3530 *XXXXXXXXXXX 3531 * 3532 */ 3533#ifdef notyet 3534 so->so_proto->pr_ctloutput = t3_ctloutput; 3535#endif 3536 3537#if 0 3538 inet_sk(sk)->id = tp->write_seq ^ jiffies; 3539#endif 3540 /* 3541 * XXX not clear what rcv_wup maps to 3542 */ 3543 /* 3544 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't 3545 * pass through opt0. 
3546 */ 3547 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) 3548 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); 3549 3550 dump_toepcb(toep); 3551 3552#ifdef notyet 3553/* 3554 * no clean interface for marking ARP up to date 3555 */ 3556 dst_confirm(sk->sk_dst_cache); 3557#endif 3558 tp->t_starttime = ticks; 3559 tp->t_state = TCPS_ESTABLISHED; 3560 soisconnected(so); 3561} 3562 3563static int 3564syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) 3565{ 3566 3567 struct in_conninfo inc; 3568 struct tcpopt to; 3569 struct tcphdr th; 3570 int mss, wsf, sack, ts; 3571 struct mbuf *m = NULL; 3572 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); 3573 unsigned int opt; 3574 3575#ifdef MAC 3576#error "no MAC support" 3577#endif 3578 3579 opt = ntohs(req->tcp_opt); 3580 3581 bzero(&to, sizeof(struct tcpopt)); 3582 3583 /* 3584 * Fill out information for entering us into the syncache 3585 */ 3586 inc.inc_fport = th.th_sport = req->peer_port; 3587 inc.inc_lport = th.th_dport = req->local_port; 3588 th.th_seq = req->rcv_isn; 3589 th.th_flags = TH_ACK; 3590 3591 inc.inc_isipv6 = 0; 3592 inc.inc_len = 0; 3593 inc.inc_faddr.s_addr = req->peer_ip; 3594 inc.inc_laddr.s_addr = req->local_ip; 3595 3596 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3597 wsf = G_TCPOPT_WSCALE_OK(opt); 3598 ts = G_TCPOPT_TSTAMP(opt); 3599 sack = G_TCPOPT_SACK(opt); 3600 3601 to.to_mss = mss; 3602 to.to_wscale = G_TCPOPT_SND_WSCALE(opt); 3603 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3604 3605 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", 3606 ntohl(req->local_ip), ntohs(req->local_port), 3607 ntohl(req->peer_ip), ntohs(req->peer_port), 3608 mss, wsf, ts, sack); 3609 return syncache_offload_expand(&inc, &to, &th, so, m); 3610} 3611 3612 3613/* 3614 * Process a CPL_PASS_ESTABLISH message. 
 * XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;	/* still the LISTEN socket's tcpcb here */
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct sockbuf *snd, *rcv;

	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;


	tdev = toep->tp_toedev;

	inp_wlock(tp->t_inpcb);

	/*
	 *
	 * XXX need to add reference while we're manipulating
	 */
	so = lso = inp_inpcbtosocket(tp->t_inpcb);

	inp_wunlock(tp->t_inpcb);

	/*
	 * NOTE(review): snd/rcv are captured BEFORE syncache expansion, i.e.
	 * from the listen socket, and SB_NOCOALESCE is applied to them below
	 * after `so` has been replaced by the child socket.  Confirm this is
	 * intentional (it looks like the flags land on the listener's
	 * sockbufs, not the new connection's).
	 */
	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);


	/* Take this toepcb off the listener's SYN queue. */
	so_lock(so);
	LIST_REMOVE(toep, synq_entry);
	so_unlock(so);

	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		CXGB_UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		CXGB_UNIMPLEMENTED();
	}

	/* From here on, tp/so refer to the newly created child connection. */
	tp = so_sototcpcb(so);
	inp_wlock(tp->t_inpcb);


	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Wire the toepcb and the new tcpcb to each other. */
	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(tdev, so);
	tp->rcv_nxt = toep->tp_copied_seq;
	install_offload_ops(so);

	/* Initialize the work-request credit accounting for this tid. */
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	/* The RSS queue that delivered this CPL is stashed in csum_data. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_release(toep);	/* drop the extra ref taken at synq insert */
	inp_wunlock(tp->t_inpcb);

	/*
	 * NOTE(review): toep is dereferenced here after toepcb_release();
	 * this relies on another reference (tp->t_toe) keeping it alive —
	 * confirm that invariant holds on all paths.
	 */
	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tdev = toep->tp_toedev;
	struct tcpcb *tp = toep->tp_tp;
	unsigned int tid = toep->tp_tid;

	log(LOG_NOTICE, "fixup_and_send_ofo\n");

	inp_lock_assert(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		/* Patch the now-known tid into the deferred message. */
		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
		/*
		 * XXX does this even make sense?
		 */
		so_sorwakeup(so);
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	toep->tp_state = tp->t_state;
	tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	/* Connection already torn down: just return the atid. */
	if (tp == NULL) {
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev; /* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	/* Exchange the temporary atid for the permanent tid. */
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3867 */ 3868static void 3869wr_ack(struct toepcb *toep, struct mbuf *m) 3870{ 3871 struct tcpcb *tp = toep->tp_tp; 3872 struct cpl_wr_ack *hdr = cplhdr(m); 3873 struct socket *so; 3874 unsigned int credits = ntohs(hdr->credits); 3875 u32 snd_una = ntohl(hdr->snd_una); 3876 int bytes = 0; 3877 struct sockbuf *snd; 3878 3879 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); 3880 3881 inp_wlock(tp->t_inpcb); 3882 so = inp_inpcbtosocket(tp->t_inpcb); 3883 toep->tp_wr_avail += credits; 3884 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3885 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3886 3887 while (credits) { 3888 struct mbuf *p = peek_wr(toep); 3889 3890 if (__predict_false(!p)) { 3891 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3892 "nothing pending, state %u wr_avail=%u\n", 3893 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); 3894 break; 3895 } 3896 CTR2(KTR_TOM, 3897 "wr_ack: p->credits=%d p->bytes=%d", 3898 p->m_pkthdr.csum_data, p->m_pkthdr.len); 3899 KASSERT(p->m_pkthdr.csum_data != 0, 3900 ("empty request still on list")); 3901 3902 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3903 3904#if DEBUG_WR > 1 3905 struct tx_data_wr *w = cplhdr(p); 3906 log(LOG_ERR, 3907 "TID %u got %u WR credits, need %u, len %u, " 3908 "main body %u, frags %u, seq # %u, ACK una %u," 3909 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3910 toep->tp_tid, credits, p->csum, p->len, 3911 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3912 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3913 toep->tp_wr_avail, count_pending_wrs(tp) - credits); 3914#endif 3915 p->m_pkthdr.csum_data -= credits; 3916 break; 3917 } else { 3918 dequeue_wr(toep); 3919 credits -= p->m_pkthdr.csum_data; 3920 bytes += p->m_pkthdr.len; 3921 CTR3(KTR_TOM, 3922 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", 3923 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); 3924 3925 m_free(p); 3926 } 3927 } 3928 3929#if DEBUG_WR 3930 
check_wr_invariants(tp); 3931#endif 3932 3933 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3934#if VALIDATE_SEQ 3935 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3936 3937 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3938 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, 3939 toep->tp_tid, tp->snd_una); 3940#endif 3941 goto out_free; 3942 } 3943 3944 if (tp->snd_una != snd_una) { 3945 tp->snd_una = snd_una; 3946 tp->ts_recent_age = ticks; 3947#ifdef notyet 3948 /* 3949 * Keep ARP entry "minty fresh" 3950 */ 3951 dst_confirm(sk->sk_dst_cache); 3952#endif 3953 if (tp->snd_una == tp->snd_nxt) 3954 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3955 } 3956 3957 snd = so_sockbuf_snd(so); 3958 if (bytes) { 3959 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); 3960 snd = so_sockbuf_snd(so); 3961 sockbuf_lock(snd); 3962 sbdrop_locked(snd, bytes); 3963 so_sowwakeup_locked(so); 3964 } 3965 3966 if (snd->sb_sndptroff < snd->sb_cc) 3967 t3_push_frames(so, 0); 3968 3969out_free: 3970 inp_wunlock(tp->t_inpcb); 3971 m_free(m); 3972} 3973 3974/* 3975 * Handler for TX_DATA_ACK CPL messages. 3976 */ 3977static int 3978do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 3979{ 3980 struct toepcb *toep = (struct toepcb *)ctx; 3981 3982 VALIDATE_SOCK(so); 3983 3984 wr_ack(toep, m); 3985 return 0; 3986} 3987 3988/* 3989 * Handler for TRACE_PKT CPL messages. Just sink these packets. 3990 */ 3991static int 3992do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) 3993{ 3994 m_freem(m); 3995 return 0; 3996} 3997 3998/* 3999 * Reset a connection that is on a listener's SYN queue or accept queue, 4000 * i.e., one that has not had a struct socket associated with it. 4001 * Must be called from process context. 4002 * 4003 * Modeled after code in inet_csk_listen_stop(). 
4004 */ 4005static void 4006t3_reset_listen_child(struct socket *child) 4007{ 4008 struct tcpcb *tp = so_sototcpcb(child); 4009 4010 t3_send_reset(tp->t_toe); 4011} 4012 4013 4014static void 4015t3_child_disconnect(struct socket *so, void *arg) 4016{ 4017 struct tcpcb *tp = so_sototcpcb(so); 4018 4019 if (tp->t_flags & TF_TOE) { 4020 inp_wlock(tp->t_inpcb); 4021 t3_reset_listen_child(so); 4022 inp_wunlock(tp->t_inpcb); 4023 } 4024} 4025 4026/* 4027 * Disconnect offloaded established but not yet accepted connections sitting 4028 * on a server's accept_queue. We just send an ABORT_REQ at this point and 4029 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 4030 */ 4031void 4032t3_disconnect_acceptq(struct socket *listen_so) 4033{ 4034 4035 so_lock(listen_so); 4036 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); 4037 so_unlock(listen_so); 4038} 4039 4040/* 4041 * Reset offloaded connections sitting on a server's syn queue. As above 4042 * we send ABORT_REQ and finish off when we get ABORT_RPL. 
4043 */ 4044 4045void 4046t3_reset_synq(struct listen_ctx *lctx) 4047{ 4048 struct toepcb *toep; 4049 4050 so_lock(lctx->lso); 4051 while (!LIST_EMPTY(&lctx->synq_head)) { 4052 toep = LIST_FIRST(&lctx->synq_head); 4053 LIST_REMOVE(toep, synq_entry); 4054 toep->tp_tp = NULL; 4055 t3_send_reset(toep); 4056 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); 4057 toepcb_release(toep); 4058 } 4059 so_unlock(lctx->lso); 4060} 4061 4062 4063int 4064t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, 4065 unsigned int nppods, unsigned int tag, unsigned int maxoff, 4066 unsigned int pg_off, unsigned int color) 4067{ 4068 unsigned int i, j, pidx; 4069 struct pagepod *p; 4070 struct mbuf *m; 4071 struct ulp_mem_io *req; 4072 unsigned int tid = toep->tp_tid; 4073 const struct tom_data *td = TOM_DATA(toep->tp_toedev); 4074 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; 4075 4076 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", 4077 gl, nppods, tag, maxoff, pg_off, color); 4078 4079 for (i = 0; i < nppods; ++i) { 4080 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); 4081 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4082 req = mtod(m, struct ulp_mem_io *); 4083 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; 4084 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4085 req->wr.wr_lo = 0; 4086 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | 4087 V_ULPTX_CMD(ULP_MEM_WRITE)); 4088 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | 4089 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); 4090 4091 p = (struct pagepod *)(req + 1); 4092 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { 4093 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); 4094 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | 4095 V_PPOD_COLOR(color)); 4096 p->pp_max_offset = htonl(maxoff); 4097 p->pp_page_offset = htonl(pg_off); 4098 p->pp_rsvd = 0; 4099 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) 4100 p->pp_addr[j] = 
pidx < gl->dgl_nelem ? 4101 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; 4102 } else 4103 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ 4104 send_or_defer(toep, m, 0); 4105 ppod_addr += PPOD_SIZE; 4106 } 4107 return (0); 4108} 4109 4110/* 4111 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. 4112 */ 4113static inline void 4114mk_cpl_barrier_ulp(struct cpl_barrier *b) 4115{ 4116 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; 4117 4118 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4119 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); 4120 b->opcode = CPL_BARRIER; 4121} 4122 4123/* 4124 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 4125 */ 4126static inline void 4127mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) 4128{ 4129 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4130 4131 txpkt = (struct ulp_txpkt *)req; 4132 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4133 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4134 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); 4135 req->cpuno = htons(cpuno); 4136} 4137 4138/* 4139 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 4140 */ 4141static inline void 4142mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, 4143 unsigned int word, uint64_t mask, uint64_t val) 4144{ 4145 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4146 4147 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 4148 tid, word, mask, val); 4149 4150 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4151 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4152 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); 4153 req->reply = V_NO_REPLY(1); 4154 req->cpu_idx = 0; 4155 req->word = htons(word); 4156 req->mask = htobe64(mask); 4157 req->val = htobe64(val); 4158} 4159 4160/* 4161 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
 */
static void
mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
		   unsigned int tid, unsigned int credits)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	/* Return rx credits and select the delayed-ack mode tunable. */
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
				 V_RX_CREDITS(credits));
}

/*
 * Cancel HW DDP buffer bufidx (0 or 1) by sending a compound work request:
 * barrier, SET_TCB_FIELD flipping the buffer-valid/active flags, GET_TCB
 * (so the reply tells us how much data landed in the buffer), barrier.
 * The pointer arithmetic below must match wrlen exactly.
 */
void
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	/* One SET_TCB_FIELD + one GET_TCB bracketed by two barriers. */
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
		sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/* Hmmm, not sure if this actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already. However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no oustanding data.
	 */
	if (bufidx == 0)
		/* Invalidate buf 0 and make buf 1 the active one. */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF0_VALID(1),
				     V_TF_DDP_ACTIVE_BUF(1));
	else
		/* Invalidate buf 1 and make buf 0 the active one. */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF1_VALID(1), 0);

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of oustanding CPL_GET_TCB requests
	 */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @sk: the socket associated with the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how made data was written into the buffer before the overlay
 * took place.
4254 */ 4255void 4256t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, 4257 unsigned int tag1, unsigned int len) 4258{ 4259 unsigned int wrlen; 4260 struct mbuf *m; 4261 struct work_request_hdr *wr; 4262 struct cpl_get_tcb *getreq; 4263 struct cpl_set_tcb_field *req; 4264 struct ddp_state *p = &toep->tp_ddp_state; 4265 4266 CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", 4267 bufidx, tag0, tag1, len); 4268#if 0 4269 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4270#endif 4271 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); 4272 m = m_gethdr_nofail(wrlen); 4273 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4274 wr = mtod(m, struct work_request_hdr *); 4275 m->m_pkthdr.len = m->m_len = wrlen; 4276 bzero(wr, wrlen); 4277 4278 4279 /* Set the ATOMIC flag to make sure that TP processes the following 4280 * CPLs in an atomic manner and no wire segments can be interleaved. 4281 */ 4282 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); 4283 req = (struct cpl_set_tcb_field *)(wr + 1); 4284 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, 4285 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | 4286 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, 4287 V_TCB_RX_DDP_BUF0_TAG(tag0) | 4288 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); 4289 req++; 4290 if (bufidx == 0) { 4291 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, 4292 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4293 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 4294 req++; 4295 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4296 V_TF_DDP_PUSH_DISABLE_0(1) | 4297 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4298 V_TF_DDP_PUSH_DISABLE_0(0) | 4299 V_TF_DDP_BUF0_VALID(1)); 4300 } else { 4301 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, 4302 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), 4303 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); 4304 req++; 4305 mk_set_tcb_field_ulp(req, toep->tp_tid, 
W_TCB_RX_DDP_FLAGS, 4306 V_TF_DDP_PUSH_DISABLE_1(1) | 4307 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4308 V_TF_DDP_PUSH_DISABLE_1(0) | 4309 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); 4310 } 4311 4312 getreq = (struct cpl_get_tcb *)(req + 1); 4313 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); 4314 4315 /* Keep track of the number of oustanding CPL_GET_TCB requests 4316 */ 4317 p->get_tcb_count++; 4318 4319#ifdef T3_TRACE 4320 T3_TRACE4(TIDTB(sk), 4321 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " 4322 "len %d", 4323 bufidx, tag0, tag1, len); 4324#endif 4325 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4326} 4327 4328/* 4329 * Sends a compound WR containing all the CPL messages needed to program the 4330 * two HW DDP buffers, namely optionally setting up the length and offset of 4331 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. 4332 */ 4333void 4334t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, 4335 unsigned int len1, unsigned int offset1, 4336 uint64_t ddp_flags, uint64_t flag_mask, int modulate) 4337{ 4338 unsigned int wrlen; 4339 struct mbuf *m; 4340 struct work_request_hdr *wr; 4341 struct cpl_set_tcb_field *req; 4342 4343 CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", 4344 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); 4345 4346#if 0 4347 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4348#endif 4349 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + 4350 (len1 ? sizeof(*req) : 0) + 4351 (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); 4352 m = m_gethdr_nofail(wrlen); 4353 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4354 wr = mtod(m, struct work_request_hdr *); 4355 bzero(wr, wrlen); 4356 4357 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4358 m->m_pkthdr.len = m->m_len = wrlen; 4359 4360 req = (struct cpl_set_tcb_field *)(wr + 1); 4361 if (len0) { /* program buffer 0 offset and length */ 4362 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, 4363 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 4364 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4365 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | 4366 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); 4367 req++; 4368 } 4369 if (len1) { /* program buffer 1 offset and length */ 4370 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, 4371 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 4372 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, 4373 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | 4374 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); 4375 req++; 4376 } 4377 4378 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, 4379 ddp_flags); 4380 4381 if (modulate) { 4382 mk_rx_data_ack_ulp(toep, 4383 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, 4384 toep->tp_copied_seq - toep->tp_rcv_wup); 4385 toep->tp_rcv_wup = toep->tp_copied_seq; 4386 } 4387 4388#ifdef T3_TRACE 4389 T3_TRACE5(TIDTB(sk), 4390 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " 4391 "modulate %d", 4392 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, 4393 modulate); 4394#endif 4395 4396 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4397} 4398 4399void 4400t3_init_wr_tab(unsigned int wr_len) 4401{ 4402 int i; 4403 4404 if (mbuf_wrs[1]) /* already initialized */ 4405 return; 4406 4407 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { 4408 int sgl_len = (3 * i) / 2 + (i & 1); 4409 4410 sgl_len += 3; 4411 mbuf_wrs[i] = sgl_len <= wr_len ? 
4412 1 : 1 + (sgl_len - 2) / (wr_len - 1); 4413 } 4414 4415 wrlen = wr_len * 8; 4416} 4417 4418int 4419t3_init_cpl_io(void) 4420{ 4421#ifdef notyet 4422 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); 4423 if (!tcphdr_skb) { 4424 log(LOG_ERR, 4425 "Chelsio TCP offload: can't allocate sk_buff\n"); 4426 return -1; 4427 } 4428 skb_put(tcphdr_skb, sizeof(struct tcphdr)); 4429 tcphdr_skb->h.raw = tcphdr_skb->data; 4430 memset(tcphdr_skb->data, 0, tcphdr_skb->len); 4431#endif 4432 4433 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); 4434 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); 4435 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); 4436 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); 4437 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 4438 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 4439 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 4440 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 4441 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 4442 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 4443 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 4444 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 4445 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); 4446 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); 4447 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); 4448 return (0); 4449} 4450 4451