cxgb_cpl_io.c revision 183292
175295Sdes/************************************************************************** 2230132Suqs 375295SdesCopyright (c) 2007-2008, Chelsio Inc. 475295SdesAll rights reserved. 575295Sdes 675295SdesRedistribution and use in source and binary forms, with or without 775295Sdesmodification, are permitted provided that the following conditions are met: 875295Sdes 975295Sdes 1. Redistributions of source code must retain the above copyright notice, 1075295Sdes this list of conditions and the following disclaimer. 1175295Sdes 1275295Sdes 2. Neither the name of the Chelsio Corporation nor the names of its 1375295Sdes contributors may be used to endorse or promote products derived from 1475295Sdes this software without specific prior written permission. 1575295Sdes 1675295SdesTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 1775295SdesAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1875295SdesIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1975295SdesARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 2075295SdesLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 2175295SdesCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 2275295SdesSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 2375295SdesINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 2475295SdesCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 2575295SdesARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 2675295SdesPOSSIBILITY OF SUCH DAMAGE. 
2775295Sdes 2875295Sdes***************************************************************************/ 29143592Sdes 30143592Sdes#include <sys/cdefs.h> 31143592Sdes__FBSDID("$FreeBSD: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c 183292 2008-09-23 03:16:54Z kmacy $"); 32143592Sdes 33143592Sdes#include <sys/param.h> 3475295Sdes#include <sys/systm.h> 3575295Sdes#include <sys/fcntl.h> 3675295Sdes#include <sys/kernel.h> 3784811Sjhb#include <sys/limits.h> 3875295Sdes#include <sys/ktr.h> 3975295Sdes#include <sys/lock.h> 4075295Sdes#include <sys/mbuf.h> 4177965Sdes#include <sys/mutex.h> 4275295Sdes#include <sys/sockstate.h> 4375295Sdes#include <sys/sockopt.h> 4475295Sdes#include <sys/socket.h> 4575295Sdes#include <sys/sockbuf.h> 4675295Sdes#include <sys/sysctl.h> 4775295Sdes#include <sys/syslog.h> 4875295Sdes#include <sys/protosw.h> 4975295Sdes#include <sys/priv.h> 5085128Sdes 5185128Sdes#if __FreeBSD_version >= 800044 5275295Sdes#include <sys/vimage.h> 5375295Sdes#else 5475295Sdes#define V_tcp_do_autosndbuf tcp_do_autosndbuf 55168764Sdes#define V_tcp_autosndbuf_max tcp_autosndbuf_max 56168764Sdes#define V_tcp_do_rfc1323 tcp_do_rfc1323 57168764Sdes#define V_tcp_do_autorcvbuf tcp_do_autorcvbuf 58168764Sdes#define V_tcp_autorcvbuf_max tcp_autorcvbuf_max 5985940Sdes#define V_tcpstat tcpstat 6085940Sdes#endif 6185940Sdes 6285940Sdes#include <net/if.h> 6375295Sdes#include <net/route.h> 64168764Sdes 65168764Sdes#include <netinet/in.h> 66168764Sdes#include <netinet/in_pcb.h> 67168764Sdes#include <netinet/in_systm.h> 68168764Sdes#include <netinet/in_var.h> 69168764Sdes 70168764Sdes 71168764Sdes#include <cxgb_osdep.h> 72168764Sdes#include <sys/mbufq.h> 73168764Sdes 74184205Sdes#include <netinet/ip.h> 75168764Sdes#include <netinet/tcp_var.h> 76168764Sdes#include <netinet/tcp_fsm.h> 77168764Sdes#include <netinet/tcp_offload.h> 78168764Sdes#include <netinet/tcp_seq.h> 79168764Sdes#include <netinet/tcp_syncache.h> 80168764Sdes#include <netinet/tcp_timer.h> 81168764Sdes#include <net/route.h> 
82168764Sdes 83168764Sdes#include <t3cdev.h> 8485128Sdes#include <common/cxgb_firmware_exports.h> 8585128Sdes#include <common/cxgb_t3_cpl.h> 86168764Sdes#include <common/cxgb_tcb.h> 87168764Sdes#include <common/cxgb_ctl_defs.h> 8885128Sdes#include <cxgb_offload.h> 89168764Sdes#include <vm/vm.h> 90168764Sdes#include <vm/pmap.h> 91168764Sdes#include <machine/bus.h> 92168764Sdes#include <sys/mvec.h> 9385128Sdes#include <ulp/toecore/cxgb_toedev.h> 9487599Sobrien#include <ulp/tom/cxgb_defs.h> 95168764Sdes#include <ulp/tom/cxgb_tom.h> 96168764Sdes#include <ulp/tom/cxgb_t3_ddp.h> 9785128Sdes#include <ulp/tom/cxgb_toepcb.h> 9887599Sobrien#include <ulp/tom/cxgb_tcp.h> 9985128Sdes#include <ulp/tom/cxgb_tcp_offload.h> 10085128Sdes 10185128Sdes/* 10287599Sobrien * For ULP connections HW may add headers, e.g., for digests, that aren't part 10385128Sdes * of the messages sent by the host but that are part of the TCP payload and 104168764Sdes * therefore consume TCP sequence space. Tx connection parameters that 105168764Sdes * operate in TCP sequence space are affected by the HW additions and need to 106168764Sdes * compensate for them to accurately track TCP sequence numbers. This array 107168764Sdes * contains the compensating extra lengths for ULP packets. It is indexed by 108168764Sdes * a packet's ULP submode. 109168764Sdes */ 110168764Sdesconst unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; 111168764Sdes 112168764Sdes#ifdef notyet 113168764Sdes/* 114168764Sdes * This sk_buff holds a fake header-only TCP segment that we use whenever we 115168764Sdes * need to exploit SW TCP functionality that expects TCP headers, such as 116168764Sdes * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple 117168764Sdes * CPUs without locking. 11897940Sdes */ 119168764Sdesstatic struct mbuf *tcphdr_mbuf __read_mostly; 120168764Sdes#endif 121168764Sdes 122168764Sdes/* 123168764Sdes * Size of WRs in bytes. 
Note that we assume all devices we are handling have 124103314Snjl * the same WR size. 125168764Sdes */ 126168764Sdesstatic unsigned int wrlen __read_mostly; 127168764Sdes 12885128Sdes/* 12985128Sdes * The number of WRs needed for an skb depends on the number of page fragments 13085128Sdes * in the skb and whether it has any payload in its main body. This maps the 131168764Sdes * length of the gather list represented by an skb into the # of necessary WRs. 13285128Sdes */ 133168764Sdesstatic unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly; 134168764Sdes 13585128Sdes/* 136168764Sdes * Max receive window supported by HW in bytes. Only a small part of it can 137168764Sdes * be set through option0, the rest needs to be set through RX_DATA_ACK. 13897940Sdes */ 139168764Sdes#define MAX_RCV_WND ((1U << 27) - 1) 140168764Sdes 141168764Sdes/* 14297940Sdes * Min receive window. We want it to be large enough to accommodate receive 143168764Sdes * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. 
144168764Sdes */ 145168764Sdes#define MIN_RCV_WND (24 * 1024U) 146168764Sdes#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS) 147168764Sdes 148168764Sdes#define VALIDATE_SEQ 0 149168764Sdes#define VALIDATE_SOCK(so) 150168764Sdes#define DEBUG_WR 0 15185128Sdes 152168764Sdes#define TCP_TIMEWAIT 1 153168764Sdes#define TCP_CLOSE 2 154168764Sdes#define TCP_DROP 3 15597940Sdes 156168764Sdesextern int tcp_do_autorcvbuf; 157168764Sdesextern int tcp_do_autosndbuf; 158168764Sdesextern int tcp_autorcvbuf_max; 159168764Sdesextern int tcp_autosndbuf_max; 160168764Sdes 161168764Sdesstatic void t3_send_reset(struct toepcb *toep); 162168764Sdesstatic void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status); 16397940Sdesstatic inline void free_atid(struct t3cdev *cdev, unsigned int tid); 164168764Sdesstatic void handle_syncache_event(int event, void *arg); 165168764Sdes 166168764Sdesstatic inline void 167168764SdesSBAPPEND(struct sockbuf *sb, struct mbuf *n) 16885128Sdes{ 16985128Sdes struct mbuf *m; 17085128Sdes 17185128Sdes m = sb->sb_mb; 17285128Sdes while (m) { 17385128Sdes KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 174123248Sdes !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 175167482Sdes !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 176167482Sdes KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 17785128Sdes m->m_next, m->m_nextpkt, m->m_flags)); 178168764Sdes m = m->m_next; 17985128Sdes } 180168764Sdes m = n; 181168764Sdes while (m) { 182168764Sdes KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || 183168764Sdes !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", 184168764Sdes !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len)); 185168764Sdes KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 186168764Sdes m->m_next, m->m_nextpkt, 
m->m_flags)); 187168764Sdes m = m->m_next; 18885128Sdes } 189168764Sdes KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set")); 19085128Sdes sbappendstream_locked(sb, n); 19185128Sdes m = sb->sb_mb; 19285128Sdes 19385128Sdes while (m) { 19485128Sdes KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x", 19585128Sdes m->m_next, m->m_nextpkt, m->m_flags)); 196123248Sdes m = m->m_next; 197167482Sdes } 198167482Sdes} 19985128Sdes 200168764Sdesstatic inline int 20185128Sdesis_t3a(const struct toedev *dev) 202168764Sdes{ 203168764Sdes return (dev->tod_ttid == TOE_ID_CHELSIO_T3); 204168764Sdes} 205168764Sdes 206168764Sdesstatic void 207168764Sdesdump_toepcb(struct toepcb *toep) 208168764Sdes{ 20985128Sdes DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n", 210168764Sdes toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode, 21185128Sdes toep->tp_mtu_idx, toep->tp_tid); 21285128Sdes 21385128Sdes DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n", 21485128Sdes toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, 21585128Sdes toep->tp_mss_clamp, toep->tp_flags); 21685128Sdes} 217123248Sdes 218168387Sdes#ifndef RTALLOC2_DEFINED 219167482Sdesstatic struct rtentry * 22085128Sdesrtalloc2(struct sockaddr *dst, int report, u_long ignflags) 221168764Sdes{ 22285128Sdes struct rtentry *rt = NULL; 223168764Sdes 224168764Sdes if ((rt = rtalloc1(dst, report, ignflags)) != NULL) 225168764Sdes RT_UNLOCK(rt); 226168764Sdes 227168764Sdes return (rt); 228168764Sdes} 229168764Sdes#endif 230168764Sdes 231168764Sdes/* 23285128Sdes * Determine whether to send a CPL message now or defer it. A message is 23385128Sdes * deferred if the connection is in SYN_SENT since we don't know the TID yet. 23485128Sdes * For connections in other states the message is sent immediately. 235123248Sdes * If through_l2t is set the message is subject to ARP processing, otherwise 236123248Sdes * it is sent directly. 
237123248Sdes */ 238123248Sdesstatic inline void 239123248Sdessend_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t) 240168764Sdes{ 241123248Sdes struct tcpcb *tp = toep->tp_tp; 242168764Sdes 243168764Sdes if (__predict_false(tp->t_state == TCPS_SYN_SENT)) { 244168764Sdes inp_wlock(tp->t_inpcb); 245168720Sdes mbufq_tail(&toep->out_of_order_queue, m); // defer 246168764Sdes inp_wunlock(tp->t_inpcb); 247168764Sdes } else if (through_l2t) 248123248Sdes l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T 249123248Sdes else 250123248Sdes cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly 251168764Sdes} 252168764Sdes 25385128Sdesstatic inline unsigned int 25485128Sdesmkprio(unsigned int cntrl, const struct toepcb *toep) 255168764Sdes{ 25685128Sdes return (cntrl); 257168764Sdes} 25897940Sdes 259168764Sdes/* 26087599Sobrien * Populate a TID_RELEASE WR. The skb must be already propely sized. 261168764Sdes */ 26287599Sobrienstatic inline void 26385128Sdesmk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid) 264168764Sdes{ 265168764Sdes struct cpl_tid_release *req; 266168764Sdes 26785128Sdes m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep)); 268168764Sdes m->m_pkthdr.len = m->m_len = sizeof(*req); 269168764Sdes req = mtod(m, struct cpl_tid_release *); 270168764Sdes req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 271168764Sdes req->wr.wr_lo = 0; 272168764Sdes OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); 273168764Sdes} 274168764Sdes 275168764Sdesstatic inline void 276168764Sdesmake_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail) 277168764Sdes{ 278168764Sdes struct tcpcb *tp = so_sototcpcb(so); 27985128Sdes struct toepcb *toep = tp->t_toe; 280168764Sdes struct tx_data_wr *req; 28185128Sdes struct sockbuf *snd; 28285128Sdes 283168764Sdes inp_lock_assert(tp->t_inpcb); 284168764Sdes snd = so_sockbuf_snd(so); 285168764Sdes 286167482Sdes req = mtod(m, struct tx_data_wr *); 
287168764Sdes m->m_len = sizeof(*req); 288168764Sdes req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); 289168387Sdes req->wr_lo = htonl(V_WR_TID(toep->tp_tid)); 290168764Sdes /* len includes the length of any HW ULP additions */ 291168764Sdes req->len = htonl(len); 292168764Sdes req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx)); 293184205Sdes /* V_TX_ULP_SUBMODE sets both the mode and submode */ 29485128Sdes req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) | 29585128Sdes V_TX_URG(/* skb_urgent(skb) */ 0 ) | 29685128Sdes V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) && 29785128Sdes (tail ? 0 : 1)))); 29885128Sdes req->sndseq = htonl(tp->snd_nxt); 29975295Sdes if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) { 30075295Sdes req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 30175295Sdes V_TX_CPU_IDX(toep->tp_qset)); 302191990Sattilio 30375295Sdes /* Sendbuffer is in units of 32KB. 30475295Sdes */ 30597940Sdes if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE) 30675295Sdes req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15)); 30775295Sdes else { 30897940Sdes req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15)); 309168637Sdes } 31075295Sdes 311168637Sdes toep->tp_flags |= TP_DATASENT; 312172697Salfred } 31375295Sdes} 31475295Sdes 31575295Sdes#define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */ 316138495Sphk 31775295Sdesint 31875295Sdest3_push_frames(struct socket *so, int req_completion) 31975295Sdes{ 32075295Sdes struct tcpcb *tp = so_sototcpcb(so); 32175295Sdes struct toepcb *toep = tp->t_toe; 32275295Sdes 32375295Sdes struct mbuf *tail, *m0, *last; 32475295Sdes struct t3cdev *cdev; 32575295Sdes struct tom_data *d; 32675295Sdes int state, bytes, count, total_bytes; 32775295Sdes bus_dma_segment_t segs[TX_MAX_SEGS], *segp; 32875295Sdes struct sockbuf *snd; 329168764Sdes 330158611Skbyanc if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) { 331158611Skbyanc DPRINTF("tcp state=%d\n", tp->t_state); 
332230249Smckusick return (0); 333158611Skbyanc } 334168764Sdes 335168764Sdes state = so_state_get(so); 336168764Sdes 337168764Sdes if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) { 338158611Skbyanc DPRINTF("disconnecting\n"); 339158611Skbyanc 340158611Skbyanc return (0); 34175295Sdes } 34275295Sdes 34375295Sdes inp_lock_assert(tp->t_inpcb); 344191990Sattilio 34575295Sdes snd = so_sockbuf_snd(so); 34675295Sdes sockbuf_lock(snd); 34775295Sdes 348191990Sattilio d = TOM_DATA(toep->tp_toedev); 349191990Sattilio cdev = d->cdev; 35075295Sdes 35175295Sdes last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb; 35275295Sdes 35375295Sdes total_bytes = 0; 35475295Sdes DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n", 35575295Sdes toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last); 35675295Sdes 357191990Sattilio if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) { 35875295Sdes KASSERT(tail, ("sbdrop error")); 35975295Sdes last = tail = tail->m_next; 36075295Sdes } 36175295Sdes 362168764Sdes if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) { 36375295Sdes DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail); 36475295Sdes sockbuf_unlock(snd); 36575295Sdes 36675295Sdes return (0); 36775295Sdes } 36875295Sdes 369191990Sattilio toep->tp_m_last = NULL; 37075295Sdes while (toep->tp_wr_avail && (tail != NULL)) { 371138495Sphk count = bytes = 0; 37275295Sdes segp = segs; 37375295Sdes if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) { 37475295Sdes sockbuf_unlock(snd); 37575295Sdes return (0); 37675295Sdes } 37775295Sdes /* 37875295Sdes * If the data in tail fits as in-line, then 37975295Sdes * make an immediate data wr. 
38075295Sdes */ 38185128Sdes if (tail->m_len <= IMM_LEN) { 38285128Sdes count = 1; 38385128Sdes bytes = tail->m_len; 384168720Sdes last = tail; 38597940Sdes tail = tail->m_next; 386168764Sdes m_set_sgl(m0, NULL); 387168764Sdes m_set_sgllen(m0, 0); 38885128Sdes make_tx_data_wr(so, m0, bytes, tail); 389168764Sdes m_append(m0, bytes, mtod(last, caddr_t)); 39085128Sdes KASSERT(!m0->m_next, ("bad append")); 391168764Sdes } else { 392168764Sdes while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail) 39385128Sdes && (tail != NULL) && (count < TX_MAX_SEGS-1)) { 39485128Sdes bytes += tail->m_len; 39585128Sdes last = tail; 39685128Sdes count++; 39785128Sdes /* 39885128Sdes * technically an abuse to be using this for a VA 39985128Sdes * but less gross than defining my own structure 40085128Sdes * or calling pmap_kextract from here :-| 40197940Sdes */ 40284383Sdes segp->ds_addr = (bus_addr_t)tail->m_data; 40384383Sdes segp->ds_len = tail->m_len; 40475295Sdes DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n", 40575295Sdes count, mbuf_wrs[count], tail->m_data, tail->m_len); 40675295Sdes segp++; 40775295Sdes tail = tail->m_next; 40875295Sdes } 40975295Sdes DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n", 41075295Sdes toep->tp_wr_avail, count, mbuf_wrs[count], tail); 41175295Sdes 41275295Sdes m_set_sgl(m0, segs); 41385128Sdes m_set_sgllen(m0, count); 41497940Sdes make_tx_data_wr(so, m0, bytes, tail); 415168720Sdes } 416168720Sdes m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep)); 41785128Sdes 41885128Sdes if (tail) { 419168637Sdes snd->sb_sndptr = tail; 42084383Sdes toep->tp_m_last = NULL; 42184383Sdes } else 42285128Sdes toep->tp_m_last = snd->sb_sndptr = last; 42385128Sdes 42475295Sdes 42575295Sdes DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last); 42675295Sdes 42775295Sdes snd->sb_sndptroff += bytes; 42875295Sdes total_bytes += bytes; 42975295Sdes toep->tp_write_seq += bytes; 43075295Sdes CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d" 43175295Sdes " tail=%p 
sndptr=%p sndptroff=%d", 43275295Sdes toep->tp_wr_avail, count, mbuf_wrs[count], 43375295Sdes tail, snd->sb_sndptr, snd->sb_sndptroff); 43475295Sdes if (tail) 43575295Sdes CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d" 43675295Sdes " tp_m_last=%p tailbuf=%p snd_una=0x%08x", 43775295Sdes total_bytes, toep->tp_m_last, tail->m_data, 43875295Sdes tp->snd_una); 43975295Sdes else 44075295Sdes CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d" 441132199Sphk " tp_m_last=%p snd_una=0x%08x", 44275295Sdes total_bytes, toep->tp_m_last, tp->snd_una); 44375295Sdes 44475295Sdes 44575295Sdes#ifdef KTR 44675295Sdes{ 44775295Sdes int i; 44875295Sdes 44975295Sdes i = 0; 45075295Sdes while (i < count && m_get_sgllen(m0)) { 45175295Sdes if ((count - i) >= 3) { 45275295Sdes CTR6(KTR_TOM, 45375295Sdes "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" 45475295Sdes " len=%d pa=0x%zx len=%d", 45575295Sdes segs[i].ds_addr, segs[i].ds_len, 45686969Sdes segs[i + 1].ds_addr, segs[i + 1].ds_len, 457 segs[i + 2].ds_addr, segs[i + 2].ds_len); 458 i += 3; 459 } else if ((count - i) == 2) { 460 CTR4(KTR_TOM, 461 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx" 462 " len=%d", 463 segs[i].ds_addr, segs[i].ds_len, 464 segs[i + 1].ds_addr, segs[i + 1].ds_len); 465 i += 2; 466 } else { 467 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d", 468 segs[i].ds_addr, segs[i].ds_len); 469 i++; 470 } 471 472 } 473} 474#endif 475 /* 476 * remember credits used 477 */ 478 m0->m_pkthdr.csum_data = mbuf_wrs[count]; 479 m0->m_pkthdr.len = bytes; 480 toep->tp_wr_avail -= mbuf_wrs[count]; 481 toep->tp_wr_unacked += mbuf_wrs[count]; 482 483 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) || 484 toep->tp_wr_unacked >= toep->tp_wr_max / 2) { 485 struct work_request_hdr *wr = cplhdr(m0); 486 487 wr->wr_hi |= htonl(F_WR_COMPL); 488 toep->tp_wr_unacked = 0; 489 } 490 KASSERT((m0->m_pkthdr.csum_data > 0) && 491 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d", 492 m0->m_pkthdr.csum_data)); 493 m0->m_type = MT_DONTFREE; 494 
enqueue_wr(toep, m0); 495 DPRINTF("sending offload tx with %d bytes in %d segments\n", 496 bytes, count); 497 l2t_send(cdev, m0, toep->tp_l2t); 498 } 499 sockbuf_unlock(snd); 500 return (total_bytes); 501} 502 503/* 504 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail 505 * under any circumstances. We take the easy way out and always queue the 506 * message to the write_queue. We can optimize the case where the queue is 507 * already empty though the optimization is probably not worth it. 508 */ 509static void 510close_conn(struct socket *so) 511{ 512 struct mbuf *m; 513 struct cpl_close_con_req *req; 514 struct tom_data *d; 515 struct inpcb *inp = so_sotoinpcb(so); 516 struct tcpcb *tp; 517 struct toepcb *toep; 518 unsigned int tid; 519 520 521 inp_wlock(inp); 522 tp = so_sototcpcb(so); 523 toep = tp->t_toe; 524 525 if (tp->t_state != TCPS_SYN_SENT) 526 t3_push_frames(so, 1); 527 528 if (toep->tp_flags & TP_FIN_SENT) { 529 inp_wunlock(inp); 530 return; 531 } 532 533 tid = toep->tp_tid; 534 535 d = TOM_DATA(toep->tp_toedev); 536 537 m = m_gethdr_nofail(sizeof(*req)); 538 m_set_priority(m, CPL_PRIORITY_DATA); 539 m_set_sgl(m, NULL); 540 m_set_sgllen(m, 0); 541 542 toep->tp_flags |= TP_FIN_SENT; 543 req = mtod(m, struct cpl_close_con_req *); 544 545 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON)); 546 req->wr.wr_lo = htonl(V_WR_TID(tid)); 547 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); 548 req->rsvd = 0; 549 inp_wunlock(inp); 550 /* 551 * XXX - need to defer shutdown while there is still data in the queue 552 * 553 */ 554 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid); 555 cxgb_ofld_send(d->cdev, m); 556 557} 558 559/* 560 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant 561 * and send it along. 
562 */ 563static void 564abort_arp_failure(struct t3cdev *cdev, struct mbuf *m) 565{ 566 struct cpl_abort_req *req = cplhdr(m); 567 568 req->cmd = CPL_ABORT_NO_RST; 569 cxgb_ofld_send(cdev, m); 570} 571 572/* 573 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are 574 * permitted to return without sending the message in case we cannot allocate 575 * an sk_buff. Returns the number of credits sent. 576 */ 577uint32_t 578t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail) 579{ 580 struct mbuf *m; 581 struct cpl_rx_data_ack *req; 582 struct toepcb *toep = tp->t_toe; 583 struct toedev *tdev = toep->tp_toedev; 584 585 m = m_gethdr_nofail(sizeof(*req)); 586 587 DPRINTF("returning %u credits to HW\n", credits); 588 589 req = mtod(m, struct cpl_rx_data_ack *); 590 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 591 req->wr.wr_lo = 0; 592 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 593 req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); 594 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep)); 595 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 596 return (credits); 597} 598 599/* 600 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. 601 * This is only used in DDP mode, so we take the opportunity to also set the 602 * DACK mode and flush any Rx credits. 
603 */ 604void 605t3_send_rx_modulate(struct toepcb *toep) 606{ 607 struct mbuf *m; 608 struct cpl_rx_data_ack *req; 609 610 m = m_gethdr_nofail(sizeof(*req)); 611 612 req = mtod(m, struct cpl_rx_data_ack *); 613 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 614 req->wr.wr_lo = 0; 615 m->m_pkthdr.len = m->m_len = sizeof(*req); 616 617 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid)); 618 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | 619 V_RX_DACK_MODE(1) | 620 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup)); 621 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 622 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 623 toep->tp_rcv_wup = toep->tp_copied_seq; 624} 625 626/* 627 * Handle receipt of an urgent pointer. 628 */ 629static void 630handle_urg_ptr(struct socket *so, uint32_t urg_seq) 631{ 632#ifdef URGENT_DATA_SUPPORTED 633 struct tcpcb *tp = so_sototcpcb(so); 634 635 urg_seq--; /* initially points past the urgent data, per BSD */ 636 637 if (tp->urg_data && !after(urg_seq, tp->urg_seq)) 638 return; /* duplicate pointer */ 639 sk_send_sigurg(sk); 640 if (tp->urg_seq == tp->copied_seq && tp->urg_data && 641 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { 642 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 643 644 tp->copied_seq++; 645 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len) 646 tom_eat_skb(sk, skb, 0); 647 } 648 tp->urg_data = TCP_URG_NOTYET; 649 tp->urg_seq = urg_seq; 650#endif 651} 652 653/* 654 * Returns true if a socket cannot accept new Rx data. 655 */ 656static inline int 657so_no_receive(const struct socket *so) 658{ 659 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); 660} 661 662/* 663 * Process an urgent data notification. 
664 */ 665static void 666rx_urg_notify(struct toepcb *toep, struct mbuf *m) 667{ 668 struct cpl_rx_urg_notify *hdr = cplhdr(m); 669 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 670 671 VALIDATE_SOCK(so); 672 673 if (!so_no_receive(so)) 674 handle_urg_ptr(so, ntohl(hdr->seq)); 675 676 m_freem(m); 677} 678 679/* 680 * Handler for RX_URG_NOTIFY CPL messages. 681 */ 682static int 683do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx) 684{ 685 struct toepcb *toep = (struct toepcb *)ctx; 686 687 rx_urg_notify(toep, m); 688 return (0); 689} 690 691static __inline int 692is_delack_mode_valid(struct toedev *dev, struct toepcb *toep) 693{ 694 return (toep->tp_ulp_mode || 695 (toep->tp_ulp_mode == ULP_MODE_TCPDDP && 696 dev->tod_ttid >= TOE_ID_CHELSIO_T3)); 697} 698 699/* 700 * Set of states for which we should return RX credits. 701 */ 702#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2) 703 704/* 705 * Called after some received data has been read. It returns RX credits 706 * to the HW for the amount of data processed. 
707 */ 708void 709t3_cleanup_rbuf(struct tcpcb *tp, int copied) 710{ 711 struct toepcb *toep = tp->t_toe; 712 struct socket *so; 713 struct toedev *dev; 714 int dack_mode, must_send, read; 715 u32 thres, credits, dack = 0; 716 struct sockbuf *rcv; 717 718 so = inp_inpcbtosocket(tp->t_inpcb); 719 rcv = so_sockbuf_rcv(so); 720 721 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) || 722 (tp->t_state == TCPS_FIN_WAIT_2))) { 723 if (copied) { 724 sockbuf_lock(rcv); 725 toep->tp_copied_seq += copied; 726 sockbuf_unlock(rcv); 727 } 728 729 return; 730 } 731 732 inp_lock_assert(tp->t_inpcb); 733 734 sockbuf_lock(rcv); 735 if (copied) 736 toep->tp_copied_seq += copied; 737 else { 738 read = toep->tp_enqueued_bytes - rcv->sb_cc; 739 toep->tp_copied_seq += read; 740 } 741 credits = toep->tp_copied_seq - toep->tp_rcv_wup; 742 toep->tp_enqueued_bytes = rcv->sb_cc; 743 sockbuf_unlock(rcv); 744 745 if (credits > rcv->sb_mbmax) { 746 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n", 747 toep->tp_copied_seq, toep->tp_rcv_wup, credits); 748 credits = rcv->sb_mbmax; 749 } 750 751 752 /* 753 * XXX this won't accurately reflect credit return - we need 754 * to look at the difference between the amount that has been 755 * put in the recv sockbuf and what is there now 756 */ 757 758 if (__predict_false(!credits)) 759 return; 760 761 dev = toep->tp_toedev; 762 thres = TOM_TUNABLE(dev, rx_credit_thres); 763 764 if (__predict_false(thres == 0)) 765 return; 766 767 if (is_delack_mode_valid(dev, toep)) { 768 dack_mode = TOM_TUNABLE(dev, delack); 769 if (__predict_false(dack_mode != toep->tp_delack_mode)) { 770 u32 r = tp->rcv_nxt - toep->tp_delack_seq; 771 772 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp) 773 dack = F_RX_DACK_CHANGE | 774 V_RX_DACK_MODE(dack_mode); 775 } 776 } else 777 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1); 778 779 /* 780 * For coalescing to work effectively ensure the receive window has 781 * at least 16KB left. 
782 */ 783 must_send = credits + 16384 >= tp->rcv_wnd; 784 785 if (must_send || credits >= thres) 786 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send); 787} 788 789static int 790cxgb_toe_disconnect(struct tcpcb *tp) 791{ 792 struct socket *so; 793 794 DPRINTF("cxgb_toe_disconnect\n"); 795 796 so = inp_inpcbtosocket(tp->t_inpcb); 797 close_conn(so); 798 return (0); 799} 800 801static int 802cxgb_toe_reset(struct tcpcb *tp) 803{ 804 struct toepcb *toep = tp->t_toe; 805 806 t3_send_reset(toep); 807 808 /* 809 * unhook from socket 810 */ 811 tp->t_flags &= ~TF_TOE; 812 toep->tp_tp = NULL; 813 tp->t_toe = NULL; 814 return (0); 815} 816 817static int 818cxgb_toe_send(struct tcpcb *tp) 819{ 820 struct socket *so; 821 822 DPRINTF("cxgb_toe_send\n"); 823 dump_toepcb(tp->t_toe); 824 825 so = inp_inpcbtosocket(tp->t_inpcb); 826 t3_push_frames(so, 1); 827 return (0); 828} 829 830static int 831cxgb_toe_rcvd(struct tcpcb *tp) 832{ 833 834 inp_lock_assert(tp->t_inpcb); 835 836 t3_cleanup_rbuf(tp, 0); 837 838 return (0); 839} 840 841static void 842cxgb_toe_detach(struct tcpcb *tp) 843{ 844 struct toepcb *toep; 845 846 /* 847 * XXX how do we handle teardown in the SYN_SENT state? 
848 * 849 */ 850 inp_lock_assert(tp->t_inpcb); 851 toep = tp->t_toe; 852 toep->tp_tp = NULL; 853 854 /* 855 * unhook from socket 856 */ 857 tp->t_flags &= ~TF_TOE; 858 tp->t_toe = NULL; 859} 860 861 862static struct toe_usrreqs cxgb_toe_usrreqs = { 863 .tu_disconnect = cxgb_toe_disconnect, 864 .tu_reset = cxgb_toe_reset, 865 .tu_send = cxgb_toe_send, 866 .tu_rcvd = cxgb_toe_rcvd, 867 .tu_detach = cxgb_toe_detach, 868 .tu_detach = cxgb_toe_detach, 869 .tu_syncache_event = handle_syncache_event, 870}; 871 872 873static void 874__set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word, 875 uint64_t mask, uint64_t val, int no_reply) 876{ 877 struct cpl_set_tcb_field *req; 878 879 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 880 toep->tp_tid, word, mask, val); 881 882 req = mtod(m, struct cpl_set_tcb_field *); 883 m->m_pkthdr.len = m->m_len = sizeof(*req); 884 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 885 req->wr.wr_lo = 0; 886 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid)); 887 req->reply = V_NO_REPLY(no_reply); 888 req->cpu_idx = 0; 889 req->word = htons(word); 890 req->mask = htobe64(mask); 891 req->val = htobe64(val); 892 893 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 894 send_or_defer(toep, m, 0); 895} 896 897static void 898t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) 899{ 900 struct mbuf *m; 901 struct tcpcb *tp = toep->tp_tp; 902 903 if (toep == NULL) 904 return; 905 906 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) { 907 printf("not seting field\n"); 908 return; 909 } 910 911 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field)); 912 913 __set_tcb_field(toep, m, word, mask, val, 1); 914} 915 916/* 917 * Set one of the t_flags bits in the TCB. 
918 */ 919static void 920set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val) 921{ 922 923 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); 924} 925 926/* 927 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. 928 */ 929static void 930t3_set_nagle(struct toepcb *toep) 931{ 932 struct tcpcb *tp = toep->tp_tp; 933 934 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY)); 935} 936 937/* 938 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. 939 */ 940void 941t3_set_keepalive(struct toepcb *toep, int on_off) 942{ 943 944 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off); 945} 946 947void 948t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off) 949{ 950 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off); 951} 952 953void 954t3_set_dack_mss(struct toepcb *toep, int on_off) 955{ 956 957 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off); 958} 959 960/* 961 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. 962 */ 963static void 964t3_set_tos(struct toepcb *toep) 965{ 966 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb); 967 968 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), 969 V_TCB_TOS(tos)); 970} 971 972 973/* 974 * In DDP mode, TP fails to schedule a timer to push RX data to the host when 975 * DDP is disabled (data is delivered to freelist). [Note that, the peer should 976 * set the PSH bit in the last segment, which would trigger delivery.] 977 * We work around the issue by setting a DDP buffer in a partial placed state, 978 * which guarantees that TP will schedule a timer. 
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

/*
 * Turn DDP on or off for a connection.  When disabling, also apply the
 * partial-placement timer workaround described above.
 */
static void
t3_enable_ddp(struct toepcb *toep, int on)
{
	if (on) {

		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	} else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);

}

/*
 * Program the tag/color for DDP buffer buf_idx (0 or 1) into the TCB.
 * Relies on W_TCB_RX_DDP_BUF1_TAG == W_TCB_RX_DDP_BUF0_TAG + 1.
 */
void
t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

/*
 * Program offset and length of DDP buffer buf_idx into the TCB.  The BUF1
 * length field sits in the upper 32 bits of its TCB word, hence the shifts
 * in the else branch.
 */
void
t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

/*
 * Look up a congestion-control algorithm by name.  Currently compiled out;
 * always succeeds.
 * NOTE(review): the dead branch returns Linux-style -EINVAL rather than a
 * FreeBSD errno; fix if CONGESTION_CONTROL_SUPPORTED is ever enabled.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

/*
 * Send a CPL_GET_TCB request for this connection's TID (deferred onto the
 * out-of-order queue while still in SYN_SENT).  Returns 0 or ENOMEM.
 */
int
t3_get_tcb(struct toepcb *toep)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = toep->tp_tp;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);

	inp_lock_assert(tp->t_inpcb);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	req->rsvd = 0;
	if (tp->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
	return 0;
}

/*
 * Register toep in the TID table under tid, taking an extra reference that
 * is dropped when the TID is removed.
 */
static inline void
so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
{

	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
1089 */ 1090static unsigned int 1091find_best_mtu(const struct t3c_data *d, unsigned short mtu) 1092{ 1093 int i = 0; 1094 1095 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) 1096 ++i; 1097 return (i); 1098} 1099 1100static unsigned int 1101select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu) 1102{ 1103 unsigned int idx; 1104 1105#ifdef notyet 1106 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt; 1107#endif 1108 if (tp) { 1109 tp->t_maxseg = pmtu - 40; 1110 if (tp->t_maxseg < td->mtus[0] - 40) 1111 tp->t_maxseg = td->mtus[0] - 40; 1112 idx = find_best_mtu(td, tp->t_maxseg + 40); 1113 1114 tp->t_maxseg = td->mtus[idx] - 40; 1115 } else 1116 idx = find_best_mtu(td, pmtu); 1117 1118 return (idx); 1119} 1120 1121static inline void 1122free_atid(struct t3cdev *cdev, unsigned int tid) 1123{ 1124 struct toepcb *toep = cxgb_free_atid(cdev, tid); 1125 1126 if (toep) 1127 toepcb_release(toep); 1128} 1129 1130/* 1131 * Release resources held by an offload connection (TID, L2T entry, etc.) 
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	struct socket *so;
	unsigned int tid = toep->tp_tid;
	struct sockbuf *rcv;

	CTR0(KTR_TOM, "t3_release_offload_resources");

	/* Nothing to release if never attached to a TOE device. */
	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	/* Drop any work requests that never completed. */
	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	/* Detach toepcb <-> tcpcb in both directions. */
	toep->tp_tp = NULL;
	if (tp) {
		inp_lock_assert(tp->t_inpcb);
		so = inp_inpcbtosocket(tp->t_inpcb);
		rcv = so_sockbuf_rcv(so);
		/*
		 * cancel any offloaded reads
		 *
		 */
		sockbuf_lock(rcv);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
		if (toep->tp_ddp_state.user_ddp_pending) {
			t3_cancel_ubuf(toep, rcv);
			toep->tp_ddp_state.user_ddp_pending = 0;
		}
		/* Wakeup drops the sockbuf lock. */
		so_sorwakeup_locked(so);

	}

	if (toep->tp_state == TCPS_SYN_SENT) {
		/* Connection never got past the ATID stage. */
		free_atid(cdev, tid);
#ifdef notyet
		__skb_queue_purge(&tp->out_of_order_queue);
#endif
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

/*
 * Switch a socket onto the offload path: install the TOE socket ops and
 * point the tcpcb's usrreq vector at cxgb_toe_usrreqs.
 */
static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = so_sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));

	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
1217 * receive window. 1218 */ 1219static __inline int 1220select_rcv_wscale(int space) 1221{ 1222 int wscale = 0; 1223 1224 if (space > MAX_RCV_WND) 1225 space = MAX_RCV_WND; 1226 1227 if (V_tcp_do_rfc1323) 1228 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; 1229 1230 return (wscale); 1231} 1232 1233/* 1234 * Determine the receive window size for a socket. 1235 */ 1236static unsigned long 1237select_rcv_wnd(struct toedev *dev, struct socket *so) 1238{ 1239 struct tom_data *d = TOM_DATA(dev); 1240 unsigned int wnd; 1241 unsigned int max_rcv_wnd; 1242 struct sockbuf *rcv; 1243 1244 rcv = so_sockbuf_rcv(so); 1245 1246 if (V_tcp_do_autorcvbuf) 1247 wnd = V_tcp_autorcvbuf_max; 1248 else 1249 wnd = rcv->sb_hiwat; 1250 1251 1252 1253 /* XXX 1254 * For receive coalescing to work effectively we need a receive window 1255 * that can accomodate a coalesced segment. 1256 */ 1257 if (wnd < MIN_RCV_WND) 1258 wnd = MIN_RCV_WND; 1259 1260 /* PR 5138 */ 1261 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ? 1262 (uint32_t)d->rx_page_size * 23 : 1263 MAX_RCV_WND); 1264 1265 return min(wnd, max_rcv_wnd); 1266} 1267 1268/* 1269 * Assign offload parameters to some socket fields. This code is used by 1270 * both active and passive opens. 
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
	struct sockbuf *snd, *rcv;

#ifdef notyet
	SOCK_LOCK_ASSERT(so);
#endif

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	log(LOG_INFO, "initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	/* Cross-link tcpcb and toepcb and record the TOE device. */
	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;

	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;

	/* MSS from the interface MTU of the route's egress interface. */
	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 *
	 */
	tp->rcv_wnd = select_rcv_wnd(dev, so);

	/* DDP only if tuned on, not opted out, and window large enough. */
        toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;

	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = so_sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = so_sototcpcb(so);
	unsigned int val;

	/* Receive buffer size is expressed in 1KB units, clamped to the field max. */
	val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
	return (val);
}

/*
 * Option 2: congestion-control flavor, taken from the cong_alg tunable when
 * one is configured (-1 means "none").
 */
static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return (V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
}

#if DEBUG_WR > 1
/* Debug: total WR credits pending on the connection's WR queue. */
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *m;
	int n = 0;

	wr_queue_walk(toep, m)
		n += m->m_pkthdr.csum_data;
	return (n);
}
#endif

#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif

/*
 * Populate mbuf m with a CPL_ACT_OPEN_REQ for atid using the socket's
 * 4-tuple and the option words computed above.
 */
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = so_sotoinpcb(so);
	struct tcpcb *tp = inp_inpcbtotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;

	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));

	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);

	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	req->wr.wr_lo = 0;
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
#if 0
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
#endif
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

/*
 * Tear down a failed active open: release offload resources and drop the
 * connection with the given errno.  Caller holds the inpcb lock; this
 * routine unlocks it.
 */
static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		inp_wunlock(tp->t_inpcb);
		tcp_offload_drop(tp, errno);
	}

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif
	{
		inp_wlock(inp);
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	}

	done:
	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);

	/* Queue the TID for release if the failed open allocated one. */
	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));

	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
/*
 * NOTE(review): dead code — `fail_act_open(so, ...)' passes an uninitialized
 * struct socket * where a struct toepcb * is expected; fix before enabling.
 */
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
/*
 * Send an active open request.
1544 */ 1545int 1546t3_connect(struct toedev *tdev, struct socket *so, 1547 struct rtentry *rt, struct sockaddr *nam) 1548{ 1549 struct mbuf *m; 1550 struct l2t_entry *e; 1551 struct tom_data *d = TOM_DATA(tdev); 1552 struct inpcb *inp = so_sotoinpcb(so); 1553 struct tcpcb *tp = intotcpcb(inp); 1554 struct toepcb *toep; /* allocated by init_offload_socket */ 1555 1556 int atid; 1557 1558 toep = toepcb_alloc(); 1559 if (toep == NULL) 1560 goto out_err; 1561 1562 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0) 1563 goto out_err; 1564 1565 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam); 1566 if (!e) 1567 goto free_tid; 1568 1569 inp_lock_assert(inp); 1570 m = m_gethdr(MT_DATA, M_WAITOK); 1571 1572#if 0 1573 m->m_toe.mt_toepcb = tp->t_toe; 1574 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure); 1575#endif 1576 so_lock(so); 1577 1578 init_offload_socket(so, tdev, atid, e, rt, toep); 1579 1580 install_offload_ops(so); 1581 1582 mk_act_open_req(so, m, atid, e); 1583 so_unlock(so); 1584 1585 soisconnecting(so); 1586 toep = tp->t_toe; 1587 m_set_toep(m, tp->t_toe); 1588 1589 toep->tp_state = TCPS_SYN_SENT; 1590 l2t_send(d->cdev, (struct mbuf *)m, e); 1591 1592 if (toep->tp_ulp_mode) 1593 t3_enable_ddp(toep, 0); 1594 return (0); 1595 1596free_tid: 1597 printf("failing connect - free atid\n"); 1598 1599 free_atid(d->cdev, atid); 1600out_err: 1601 printf("return ENOMEM\n"); 1602 return (ENOMEM); 1603} 1604 1605/* 1606 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do 1607 * not send multiple ABORT_REQs for the same connection and also that we do 1608 * not try to send a message after the connection has closed. Returns 1 if 1609 * an ABORT_REQ wasn't generated after all, 0 otherwise. 
1610 */ 1611static void 1612t3_send_reset(struct toepcb *toep) 1613{ 1614 1615 struct cpl_abort_req *req; 1616 unsigned int tid = toep->tp_tid; 1617 int mode = CPL_ABORT_SEND_RST; 1618 struct tcpcb *tp = toep->tp_tp; 1619 struct toedev *tdev = toep->tp_toedev; 1620 struct socket *so = NULL; 1621 struct mbuf *m; 1622 struct sockbuf *snd; 1623 1624 if (tp) { 1625 inp_lock_assert(tp->t_inpcb); 1626 so = inp_inpcbtosocket(tp->t_inpcb); 1627 } 1628 1629 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) || 1630 tdev == NULL)) 1631 return; 1632 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN); 1633 1634 snd = so_sockbuf_snd(so); 1635 /* Purge the send queue so we don't send anything after an abort. */ 1636 if (so) 1637 sbflush(snd); 1638 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev)) 1639 mode |= CPL_ABORT_POST_CLOSE_REQ; 1640 1641 m = m_gethdr_nofail(sizeof(*req)); 1642 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep)); 1643 set_arp_failure_handler(m, abort_arp_failure); 1644 1645 req = mtod(m, struct cpl_abort_req *); 1646 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); 1647 req->wr.wr_lo = htonl(V_WR_TID(tid)); 1648 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); 1649 req->rsvd0 = tp ? 
htonl(tp->snd_nxt) : 0; 1650 req->rsvd1 = !(toep->tp_flags & TP_DATASENT); 1651 req->cmd = mode; 1652 if (tp && (tp->t_state == TCPS_SYN_SENT)) 1653 mbufq_tail(&toep->out_of_order_queue, m); // defer 1654 else 1655 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); 1656} 1657 1658static int 1659t3_ip_ctloutput(struct socket *so, struct sockopt *sopt) 1660{ 1661 struct inpcb *inp; 1662 int error, optval; 1663 1664 if (sopt->sopt_name == IP_OPTIONS) 1665 return (ENOPROTOOPT); 1666 1667 if (sopt->sopt_name != IP_TOS) 1668 return (EOPNOTSUPP); 1669 1670 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 1671 1672 if (error) 1673 return (error); 1674 1675 if (optval > IPTOS_PREC_CRITIC_ECP) 1676 return (EINVAL); 1677 1678 inp = so_sotoinpcb(so); 1679 inp_wlock(inp); 1680 inp_ip_tos_set(inp, optval); 1681#if 0 1682 inp->inp_ip_tos = optval; 1683#endif 1684 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe); 1685 inp_wunlock(inp); 1686 1687 return (0); 1688} 1689 1690static int 1691t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1692{ 1693 int err = 0; 1694 size_t copied; 1695 1696 if (sopt->sopt_name != TCP_CONGESTION && 1697 sopt->sopt_name != TCP_NODELAY) 1698 return (EOPNOTSUPP); 1699 1700 if (sopt->sopt_name == TCP_CONGESTION) { 1701 char name[TCP_CA_NAME_MAX]; 1702 int optlen = sopt->sopt_valsize; 1703 struct tcpcb *tp; 1704 1705 if (sopt->sopt_dir == SOPT_GET) { 1706 KASSERT(0, ("unimplemented")); 1707 return (EOPNOTSUPP); 1708 } 1709 1710 if (optlen < 1) 1711 return (EINVAL); 1712 1713 err = copyinstr(sopt->sopt_val, name, 1714 min(TCP_CA_NAME_MAX - 1, optlen), &copied); 1715 if (err) 1716 return (err); 1717 if (copied < 1) 1718 return (EINVAL); 1719 1720 tp = so_sototcpcb(so); 1721 /* 1722 * XXX I need to revisit this 1723 */ 1724 if ((err = t3_set_cong_control(so, name)) == 0) { 1725#ifdef CONGESTION_CONTROL_SUPPORTED 1726 tp->t_cong_control = strdup(name, M_CXGB); 1727#endif 1728 } else 1729 return (err); 1730 } else { 1731 int optval, oldval; 1732 
		struct inpcb *inp;
		struct tcpcb *tp;

		if (sopt->sopt_dir == SOPT_GET)
			return (EOPNOTSUPP);

		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = so_sotoinpcb(so);
		inp_wlock(inp);
		tp = inp_inpcbtotcpcb(inp);

		/* Toggle TF_NODELAY under the inpcb lock. */
		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		inp_wunlock(inp);


		/*
		 * NOTE(review): tp->t_flags is re-read after the unlock;
		 * only pushes Nagle to the TCB when the flag changed.
		 */
		if (oldval != tp->t_flags && (tp->t_toe != NULL))
			t3_set_nagle(tp->t_toe);

	}

	return (0);
}

/*
 * Socket-option entry point for offloaded connections: dispatch to the IP
 * or TCP handler above, falling back to the stack's tcp_ctloutput() for
 * anything they do not support.
 */
int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP)
		err = t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return (tcp_ctloutput(so, sopt));
}

/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	return (1);
}

/*
 * Handles Rx data that arrives in a state where the socket isn't accepting
 * new data.
 */
static void
handle_excess_rx(struct toepcb *toep, struct mbuf *m)
{

	if (need_rst_on_excess_rx(toep) &&
	    !(toep->tp_flags & TP_ABORT_SHUTDOWN))
		t3_send_reset(toep);
	m_freem(m);
}

/*
 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
 * by getting the DDP offset from the TCB.
 */
static void
tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
	struct ddp_state *q = &toep->tp_ddp_state;
	struct ddp_buf_state *bsp;
	struct cpl_get_tcb_rpl *hdr;
	unsigned int ddp_offset;
	struct socket *so;
	struct tcpcb *tp;
	struct sockbuf *rcv;
	int state;

	uint64_t t;
	__be64 *tcb;

	tp = toep->tp_tp;
	so = inp_inpcbtosocket(tp->t_inpcb);

	inp_lock_assert(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	/* Note that we only accout for CPL_GET_TCB issued by the DDP code.
	 * We really need a cookie in order to dispatch the RPLs.
	 */
	q->get_tcb_count--;

	/* It is a possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx and hence no further processing of this
	 * skb is required. However, the app might be sleeping on
	 * !q->get_tcb_count and we need to wake it up.
	 */
	if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
		/* NOTE(review): this inner `state' shadows the outer one. */
		int state = so_state_get(so);

		m_freem(m);
		if (__predict_true((state & SS_NOFDREF) == 0))
			so_sorwakeup_locked(so);
		else
			sockbuf_unlock(rcv);

		return;
	}

	/* Extract the current DDP offset for the active buffer from the
	 * raw TCB snapshot that follows the CPL header. */
	bsp = &q->buf_state[q->cur_buf];
	hdr = cplhdr(m);
	tcb = (__be64 *)(hdr + 1);
	if (q->cur_buf == 0) {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
		ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
	} else {
		t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
		ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
	}
	ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
	/* The mbuf describes the newly-placed span [cur_offset, ddp_offset). */
	m->m_cur_offset = bsp->cur_offset;
	bsp->cur_offset = ddp_offset;
	m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;

	CTR5(KTR_TOM,
	    "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
	    q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
	KASSERT(ddp_offset >= m->m_cur_offset,
	    ("ddp_offset=%u less than cur_offset=%u",
		ddp_offset, m->m_cur_offset));

#if 0
{
	unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;

	t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
	ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;

	t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
	rcv_nxt = t >> S_TCB_RCV_NXT;
	rcv_nxt &= M_TCB_RCV_NXT;

	t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
	rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
	rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;

	T3_TRACE2(TIDTB(sk),
		  "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
		  ddp_flags, rcv_nxt - rx_hdr_offset);
	T3_TRACE4(TB(q),
		  "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
		  tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
	T3_TRACE3(TB(q),
		  "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
		  rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
	T3_TRACE2(TB(q),
		  "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
		  q->buf_state[0].flags, q->buf_state[1].flags);

}
#endif
	/*
	 * NOTE(review): rcv is still locked on this path; handle_excess_rx()
	 * does not unlock it — confirm the lock is released downstream.
	 */
	if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
		handle_excess_rx(toep, m);
		return;
	}

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
	}
#endif
	if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
		T3_TRACE0(TB(q),
			  "tcb_rpl_as_ddp_complete: CANCEL UBUF");

		if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
			printk("!cancel_ubuf");
			t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
		}
#endif
		/* User buffer: mark it pushed/complete and flip hw buffers. */
		m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
		bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
		q->cur_buf ^= 1;
	} else if (bsp->flags & DDP_BF_NOFLIP) {

		m->m_ddp_flags = 1;    /* always a kernel buffer */

		/* now HW buffer carries a user buffer */
		bsp->flags &= ~DDP_BF_NOFLIP;
		bsp->flags |= DDP_BF_NOCOPY;

		/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
		 * any new data in which case we're done. If in addition the
		 * offset is 0, then there wasn't a completion for the kbuf
		 * and we need to decrement the posted count.
		 */
		if (m->m_pkthdr.len == 0) {
			if (ddp_offset == 0) {
				q->kbuf_posted--;
				bsp->flags |= DDP_BF_NODATA;
			}
			sockbuf_unlock(rcv);
			m_free(m);
			return;
		}
	} else {
		sockbuf_unlock(rcv);

		/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
		 * but it got here way late and nobody cares anymore.
		 */
		m_free(m);
		return;
	}

	/* Hand the placed data up the socket as a DDP mbuf. */
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
		  m->m_seq, q->cur_buf, m->m_pkthdr.len);
	if (m->m_pkthdr.len == 0) {
		q->user_ddp_pending = 0;
		m_free(m);
	} else
		SBAPPEND(rcv, m);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
 * in that case they are similar to DDP completions.
 */
static int
do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	/* OK if socket doesn't exist */
	if (toep == NULL) {
		printf("null toep in do_get_tcb_rpl\n");
		return (CPL_RET_BUF_DONE);
	}

	inp_wlock(toep->tp_tp->t_inpcb);
	tcb_rpl_as_ddp_complete(toep, m);
	inp_wunlock(toep->tp_tp->t_inpcb);

	return (0);
}

/*
 * Account for data the hardware DDP-placed directly into a posted buffer
 * while a plain RX_DATA arrived: advance rcv_nxt, tag m as an M_DDP mbuf
 * covering the placed span, and flip the active buffer when allowed.
 */
static void
handle_ddp_data(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data *hdr = cplhdr(m);
	unsigned int rcv_nxt = ntohl(hdr->seq);
	struct sockbuf *rcv;

	/* Nothing was placed ahead of the stream; nothing to account for. */
	if (tp->rcv_nxt == rcv_nxt)
		return;

	inp_lock_assert(tp->t_inpcb);
	so  = inp_inpcbtosocket(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	q = &toep->tp_ddp_state;
	bsp = &q->buf_state[q->cur_buf];
	KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
		rcv_nxt, tp->rcv_nxt));
	m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
	    rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);

#ifdef T3_TRACE
	if ((int)m->m_pkthdr.len < 0) {
		t3_ddp_error(so, "handle_ddp_data: neg len");
	}
#endif
	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	m->m_cur_offset = bsp->cur_offset;
	m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;
	bsp->cur_offset += m->m_pkthdr.len;
	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;
	/*
	 * For now, don't re-enable DDP after a connection fell out of  DDP
	 * mode.
	 */
	q->ubuf_ddp_ready = 0;
	sockbuf_unlock(rcv);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct sockbuf *rcv;
	int state;
	int len = be16toh(hdr->len);

	inp_wlock(tp->t_inpcb);

	so  = inp_inpcbtosocket(tp->t_inpcb);

	/* Socket no longer accepting data: RST the peer and drop the mbuf. */
	if (__predict_false(so_no_receive(so))) {
		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		TRACE_EXIT;
		return;
	}

	if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
		handle_ddp_data(toep, m);

	m->m_seq = ntohl(hdr->seq);
	m->m_ulp_mode = 0;                    /* for iSCSI */

#if VALIDATE_SEQ
	if (__predict_false(m->m_seq != tp->rcv_nxt)) {
		log(LOG_ERR,
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		    toep->tp_toedev->name, toep->tp_tid, m->m_seq,
		       tp->rcv_nxt);
		m_freem(m);
		inp_wunlock(tp->t_inpcb);
		return;
	}
#endif
	/* Strip the CPL header; only payload goes into the sockbuf. */
	m_adj(m, sizeof(*hdr));

#ifdef URGENT_DATA_SUPPORTED
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif
	/* Track delayed-ACK mode changes reported by the hardware. */
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}
	CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
	    m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);

	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
	CTR2(KTR_TOM,
	    "new_rx_data: seq 0x%x len %u",
	    m->m_seq, m->m_pkthdr.len);
	inp_wunlock(tp->t_inpcb);
	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
#if 0
	if (sb_notify(rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
#endif
	SBAPPEND(rcv, m);

#ifdef notyet
	/*
	 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
	 *
	 */
	KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),

	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, rcv->sb_cc, rcv->sb_mbmax));
#endif


	CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
	    rcv->sb_cc, rcv->sb_mbcnt);

	state = so_state_get(so);
	if (__predict_true((state & SS_NOFDREF) == 0))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);

	new_rx_data(toep, m);

	return (0);
}

/*
 * Process an RX_DATA_DDP CPL: data was placed directly into a posted DDP
 * buffer.  Advances rcv_nxt, converts m into an M_DDP mbuf describing the
 * placed span, appends it to the receive sockbuf, and wakes the reader
 * when appropriate.
 */
static void
new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	struct socket *so;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
	int nomoredata = 0;
	unsigned int delack_mode;
	struct sockbuf *rcv;

	tp = toep->tp_tp;
	inp_wlock(tp->t_inpcb);
	so = inp_inpcbtosocket(tp->t_inpcb);

	if (__predict_false(so_no_receive(so))) {

		handle_excess_rx(toep, m);
		inp_wunlock(tp->t_inpcb);
		return;
	}

	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	CTR4(KTR_TOM,
	    "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
	    "hdr seq 0x%x len %u",
	    tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
	    ntohs(hdr->len));
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
	    G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	delack_mode = G_DDP_DACK_MODE(ddp_report);
	if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
		toep->tp_delack_mode = delack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	m->m_seq = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	tp->t_rcvtime = ticks;
	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
	inp_wunlock(tp->t_inpcb);
	CTR3(KTR_TOM,
	    "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
	    m->m_len, rcv_nxt, m->m_seq);

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
	m->m_cur_offset = end_offset - m->m_pkthdr.len;

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);

	m->m_ddp_gl = (unsigned char *)bsp->gl;
	m->m_flags |= M_DDP;
	bsp->cur_offset = end_offset;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;

	/*
	 * Length is only meaningful for kbuf
	 */
	if (!(bsp->flags & DDP_BF_NOCOPY))
		KASSERT(m->m_len <= bsp->gl->dgl_length,
		    ("length received exceeds ddp pages: len=%d dgl_length=%d",
			m->m_len, bsp->gl->dgl_length));

	KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
	KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
		panic("spurious ddp completion");
	} else {
		m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_ddp_flags |= DDP_BF_PSH;
	/* NOTE(review): nomoredata is always 0 here; this branch is inert. */
	if (nomoredata)
		m->m_ddp_flags |= DDP_BF_NODATA;

#ifdef notyet
	skb_reset_transport_header(skb);
	tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
#endif
	SBAPPEND(rcv, m);

	if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
	    (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
		|| !(m->m_ddp_flags & DDP_BF_NOCOPY))))
		so_sorwakeup_locked(so);
	else
		sockbuf_unlock(rcv);
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR |
F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ 2296 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ 2297 F_DDP_INVALID_PPOD) 2298 2299/* 2300 * Handler for RX_DATA_DDP CPL messages. 2301 */ 2302static int 2303do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2304{ 2305 struct toepcb *toep = ctx; 2306 const struct cpl_rx_data_ddp *hdr = cplhdr(m); 2307 2308 VALIDATE_SOCK(so); 2309 2310 if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) { 2311 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n", 2312 GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); 2313 return (CPL_RET_BUF_DONE); 2314 } 2315#if 0 2316 skb->h.th = tcphdr_skb->h.th; 2317#endif 2318 new_rx_data_ddp(toep, m); 2319 return (0); 2320} 2321 2322static void 2323process_ddp_complete(struct toepcb *toep, struct mbuf *m) 2324{ 2325 struct tcpcb *tp = toep->tp_tp; 2326 struct socket *so; 2327 struct ddp_state *q; 2328 struct ddp_buf_state *bsp; 2329 struct cpl_rx_ddp_complete *hdr; 2330 unsigned int ddp_report, buf_idx, when, delack_mode; 2331 int nomoredata = 0; 2332 struct sockbuf *rcv; 2333 2334 inp_wlock(tp->t_inpcb); 2335 so = inp_inpcbtosocket(tp->t_inpcb); 2336 2337 if (__predict_false(so_no_receive(so))) { 2338 struct inpcb *inp = so_sotoinpcb(so); 2339 2340 handle_excess_rx(toep, m); 2341 inp_wunlock(inp); 2342 return; 2343 } 2344 q = &toep->tp_ddp_state; 2345 hdr = cplhdr(m); 2346 ddp_report = ntohl(hdr->ddp_report); 2347 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; 2348 m->m_pkthdr.csum_data = tp->rcv_nxt; 2349 2350 rcv = so_sockbuf_rcv(so); 2351 sockbuf_lock(rcv); 2352 2353 bsp = &q->buf_state[buf_idx]; 2354 when = bsp->cur_offset; 2355 m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when; 2356 tp->rcv_nxt += m->m_len; 2357 tp->t_rcvtime = ticks; 2358 2359 delack_mode = G_DDP_DACK_MODE(ddp_report); 2360 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) { 2361 toep->tp_delack_mode = delack_mode; 2362 toep->tp_delack_seq = 
tp->rcv_nxt; 2363 } 2364#ifdef notyet 2365 skb_reset_transport_header(skb); 2366 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ 2367#endif 2368 inp_wunlock(tp->t_inpcb); 2369 2370 KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2371 CTR5(KTR_TOM, 2372 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " 2373 "ddp_report 0x%x offset %u, len %u", 2374 tp->rcv_nxt, bsp->cur_offset, ddp_report, 2375 G_DDP_OFFSET(ddp_report), m->m_len); 2376 2377 m->m_cur_offset = bsp->cur_offset; 2378 bsp->cur_offset += m->m_len; 2379 2380 if (!(bsp->flags & DDP_BF_NOFLIP)) { 2381 q->cur_buf ^= 1; /* flip buffers */ 2382 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length) 2383 nomoredata=1; 2384 } 2385 2386 CTR4(KTR_TOM, 2387 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " 2388 "ddp_report %u offset %u", 2389 tp->rcv_nxt, bsp->cur_offset, ddp_report, 2390 G_DDP_OFFSET(ddp_report)); 2391 2392 m->m_ddp_gl = (unsigned char *)bsp->gl; 2393 m->m_flags |= M_DDP; 2394 m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1; 2395 if (bsp->flags & DDP_BF_NOCOPY) 2396 bsp->flags &= ~DDP_BF_NOCOPY; 2397 if (nomoredata) 2398 m->m_ddp_flags |= DDP_BF_NODATA; 2399 2400 SBAPPEND(rcv, m); 2401 if ((so_state_get(so) & SS_NOFDREF) == 0) 2402 so_sorwakeup_locked(so); 2403 else 2404 sockbuf_unlock(rcv); 2405} 2406 2407/* 2408 * Handler for RX_DDP_COMPLETE CPL messages. 2409 */ 2410static int 2411do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2412{ 2413 struct toepcb *toep = ctx; 2414 2415 VALIDATE_SOCK(so); 2416#if 0 2417 skb->h.th = tcphdr_skb->h.th; 2418#endif 2419 process_ddp_complete(toep, m); 2420 return (0); 2421} 2422 2423/* 2424 * Move a socket to TIME_WAIT state. We need to make some adjustments to the 2425 * socket state before calling tcp_time_wait to comply with its expectations. 2426 */ 2427static void 2428enter_timewait(struct tcpcb *tp) 2429{ 2430 /* 2431 * Bump rcv_nxt for the peer FIN. 
We don't do this at the time we 2432 * process peer_close because we don't want to carry the peer FIN in 2433 * the socket's receive queue and if we increment rcv_nxt without 2434 * having the FIN in the receive queue we'll confuse facilities such 2435 * as SIOCINQ. 2436 */ 2437 inp_wlock(tp->t_inpcb); 2438 tp->rcv_nxt++; 2439 2440 tp->ts_recent_age = 0; /* defeat recycling */ 2441 tp->t_srtt = 0; /* defeat tcp_update_metrics */ 2442 inp_wunlock(tp->t_inpcb); 2443 tcp_offload_twstart(tp); 2444} 2445 2446/* 2447 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This 2448 * function deals with the data that may be reported along with the FIN. 2449 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to 2450 * perform normal FIN-related processing. In the latter case 1 indicates that 2451 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the 2452 * skb can be freed. 2453 */ 2454static int 2455handle_peer_close_data(struct socket *so, struct mbuf *m) 2456{ 2457 struct tcpcb *tp = so_sototcpcb(so); 2458 struct toepcb *toep = tp->t_toe; 2459 struct ddp_state *q; 2460 struct ddp_buf_state *bsp; 2461 struct cpl_peer_close *req = cplhdr(m); 2462 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ 2463 struct sockbuf *rcv; 2464 2465 if (tp->rcv_nxt == rcv_nxt) /* no data */ 2466 return (0); 2467 2468 CTR0(KTR_TOM, "handle_peer_close_data"); 2469 if (__predict_false(so_no_receive(so))) { 2470 handle_excess_rx(toep, m); 2471 2472 /* 2473 * Although we discard the data we want to process the FIN so 2474 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + 2475 * PEER_CLOSE without data. In particular this PEER_CLOSE 2476 * may be what will close the connection. We return 1 because 2477 * handle_excess_rx() already freed the packet. 
2478 */ 2479 return (1); 2480 } 2481 2482 inp_lock_assert(tp->t_inpcb); 2483 q = &toep->tp_ddp_state; 2484 rcv = so_sockbuf_rcv(so); 2485 sockbuf_lock(rcv); 2486 2487 bsp = &q->buf_state[q->cur_buf]; 2488 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; 2489 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len)); 2490 m->m_ddp_gl = (unsigned char *)bsp->gl; 2491 m->m_flags |= M_DDP; 2492 m->m_cur_offset = bsp->cur_offset; 2493 m->m_ddp_flags = 2494 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; 2495 m->m_seq = tp->rcv_nxt; 2496 tp->rcv_nxt = rcv_nxt; 2497 bsp->cur_offset += m->m_pkthdr.len; 2498 if (!(bsp->flags & DDP_BF_NOFLIP)) 2499 q->cur_buf ^= 1; 2500#ifdef notyet 2501 skb_reset_transport_header(skb); 2502 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ 2503#endif 2504 tp->t_rcvtime = ticks; 2505 SBAPPEND(rcv, m); 2506 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 2507 so_sorwakeup_locked(so); 2508 else 2509 sockbuf_unlock(rcv); 2510 2511 return (1); 2512} 2513 2514/* 2515 * Handle a peer FIN. 2516 */ 2517static void 2518do_peer_fin(struct toepcb *toep, struct mbuf *m) 2519{ 2520 struct socket *so; 2521 struct tcpcb *tp = toep->tp_tp; 2522 int keep, action; 2523 2524 action = keep = 0; 2525 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state); 2526 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { 2527 printf("abort_pending set\n"); 2528 2529 goto out; 2530 } 2531 inp_wlock(tp->t_inpcb); 2532 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 2533 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) { 2534 keep = handle_peer_close_data(so, m); 2535 if (keep < 0) { 2536 inp_wunlock(tp->t_inpcb); 2537 return; 2538 } 2539 } 2540 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2541 CTR1(KTR_TOM, 2542 "waking up waiters for cantrcvmore on %p ", so); 2543 socantrcvmore(so); 2544 2545 /* 2546 * If connection is half-synchronized 2547 * (ie NEEDSYN flag on) then delay ACK, 2548 * so it may be piggybacked when SYN is sent. 
2549 * Otherwise, since we received a FIN then no 2550 * more input can be expected, send ACK now. 2551 */ 2552 if (tp->t_flags & TF_NEEDSYN) 2553 tp->t_flags |= TF_DELACK; 2554 else 2555 tp->t_flags |= TF_ACKNOW; 2556 tp->rcv_nxt++; 2557 } 2558 2559 switch (tp->t_state) { 2560 case TCPS_SYN_RECEIVED: 2561 tp->t_starttime = ticks; 2562 /* FALLTHROUGH */ 2563 case TCPS_ESTABLISHED: 2564 tp->t_state = TCPS_CLOSE_WAIT; 2565 break; 2566 case TCPS_FIN_WAIT_1: 2567 tp->t_state = TCPS_CLOSING; 2568 break; 2569 case TCPS_FIN_WAIT_2: 2570 /* 2571 * If we've sent an abort_req we must have sent it too late, 2572 * HW will send us a reply telling us so, and this peer_close 2573 * is really the last message for this connection and needs to 2574 * be treated as an abort_rpl, i.e., transition the connection 2575 * to TCP_CLOSE (note that the host stack does this at the 2576 * time of generating the RST but we must wait for HW). 2577 * Otherwise we enter TIME_WAIT. 2578 */ 2579 t3_release_offload_resources(toep); 2580 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2581 action = TCP_CLOSE; 2582 } else { 2583 action = TCP_TIMEWAIT; 2584 } 2585 break; 2586 default: 2587 log(LOG_ERR, 2588 "%s: TID %u received PEER_CLOSE in bad state %d\n", 2589 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state); 2590 } 2591 inp_wunlock(tp->t_inpcb); 2592 2593 if (action == TCP_TIMEWAIT) { 2594 enter_timewait(tp); 2595 } else if (action == TCP_DROP) { 2596 tcp_offload_drop(tp, 0); 2597 } else if (action == TCP_CLOSE) { 2598 tcp_offload_close(tp); 2599 } 2600 2601#ifdef notyet 2602 /* Do not send POLL_HUP for half duplex close. */ 2603 if ((sk->sk_shutdown & SEND_SHUTDOWN) || 2604 sk->sk_state == TCP_CLOSE) 2605 sk_wake_async(so, 1, POLL_HUP); 2606 else 2607 sk_wake_async(so, 1, POLL_IN); 2608#endif 2609 2610out: 2611 if (!keep) 2612 m_free(m); 2613} 2614 2615/* 2616 * Handler for PEER_CLOSE CPL messages. 
2617 */ 2618static int 2619do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2620{ 2621 struct toepcb *toep = (struct toepcb *)ctx; 2622 2623 VALIDATE_SOCK(so); 2624 2625 do_peer_fin(toep, m); 2626 return (0); 2627} 2628 2629static void 2630process_close_con_rpl(struct toepcb *toep, struct mbuf *m) 2631{ 2632 struct cpl_close_con_rpl *rpl = cplhdr(m); 2633 struct tcpcb *tp = toep->tp_tp; 2634 struct socket *so; 2635 int action = 0; 2636 struct sockbuf *rcv; 2637 2638 inp_wlock(tp->t_inpcb); 2639 so = inp_inpcbtosocket(tp->t_inpcb); 2640 2641 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ 2642 2643 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) { 2644 inp_wunlock(tp->t_inpcb); 2645 goto out; 2646 } 2647 2648 CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep, 2649 tp->t_state, !!(so_state_get(so) & SS_NOFDREF)); 2650 2651 switch (tp->t_state) { 2652 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ 2653 t3_release_offload_resources(toep); 2654 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2655 action = TCP_CLOSE; 2656 2657 } else { 2658 action = TCP_TIMEWAIT; 2659 } 2660 break; 2661 case TCPS_LAST_ACK: 2662 /* 2663 * In this state we don't care about pending abort_rpl. 2664 * If we've sent abort_req it was post-close and was sent too 2665 * late, this close_con_rpl is the actual last message. 2666 */ 2667 t3_release_offload_resources(toep); 2668 action = TCP_CLOSE; 2669 break; 2670 case TCPS_FIN_WAIT_1: 2671 /* 2672 * If we can't receive any more 2673 * data, then closing user can proceed. 2674 * Starting the timer is contrary to the 2675 * specification, but if we don't get a FIN 2676 * we'll hang forever. 2677 * 2678 * XXXjl: 2679 * we should release the tp also, and use a 2680 * compressed state. 
2681 */ 2682 if (so) 2683 rcv = so_sockbuf_rcv(so); 2684 else 2685 break; 2686 2687 if (rcv->sb_state & SBS_CANTRCVMORE) { 2688 int timeout; 2689 2690 if (so) 2691 soisdisconnected(so); 2692 timeout = (tcp_fast_finwait2_recycle) ? 2693 tcp_finwait2_timeout : tcp_maxidle; 2694 tcp_timer_activate(tp, TT_2MSL, timeout); 2695 } 2696 tp->t_state = TCPS_FIN_WAIT_2; 2697 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 && 2698 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { 2699 action = TCP_DROP; 2700 } 2701 2702 break; 2703 default: 2704 log(LOG_ERR, 2705 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", 2706 toep->tp_toedev->tod_name, toep->tp_tid, 2707 tp->t_state); 2708 } 2709 inp_wunlock(tp->t_inpcb); 2710 2711 2712 if (action == TCP_TIMEWAIT) { 2713 enter_timewait(tp); 2714 } else if (action == TCP_DROP) { 2715 tcp_offload_drop(tp, 0); 2716 } else if (action == TCP_CLOSE) { 2717 tcp_offload_close(tp); 2718 } 2719out: 2720 m_freem(m); 2721} 2722 2723/* 2724 * Handler for CLOSE_CON_RPL CPL messages. 2725 */ 2726static int 2727do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, 2728 void *ctx) 2729{ 2730 struct toepcb *toep = (struct toepcb *)ctx; 2731 2732 process_close_con_rpl(toep, m); 2733 return (0); 2734} 2735 2736/* 2737 * Process abort replies. We only process these messages if we anticipate 2738 * them as the coordination between SW and HW in this area is somewhat lacking 2739 * and sometimes we get ABORT_RPLs after we are done with the connection that 2740 * originated the ABORT_REQ. 
2741 */ 2742static void 2743process_abort_rpl(struct toepcb *toep, struct mbuf *m) 2744{ 2745 struct tcpcb *tp = toep->tp_tp; 2746 struct socket *so; 2747 int needclose = 0; 2748 2749#ifdef T3_TRACE 2750 T3_TRACE1(TIDTB(sk), 2751 "process_abort_rpl: GTS rpl pending %d", 2752 sock_flag(sk, ABORT_RPL_PENDING)); 2753#endif 2754 2755 inp_wlock(tp->t_inpcb); 2756 so = inp_inpcbtosocket(tp->t_inpcb); 2757 2758 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 2759 /* 2760 * XXX panic on tcpdrop 2761 */ 2762 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev)) 2763 toep->tp_flags |= TP_ABORT_RPL_RCVD; 2764 else { 2765 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING); 2766 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) || 2767 !is_t3a(toep->tp_toedev)) { 2768 if (toep->tp_flags & TP_ABORT_REQ_RCVD) 2769 panic("TP_ABORT_REQ_RCVD set"); 2770 t3_release_offload_resources(toep); 2771 needclose = 1; 2772 } 2773 } 2774 } 2775 inp_wunlock(tp->t_inpcb); 2776 2777 if (needclose) 2778 tcp_offload_close(tp); 2779 2780 m_free(m); 2781} 2782 2783/* 2784 * Handle an ABORT_RPL_RSS CPL message. 2785 */ 2786static int 2787do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) 2788{ 2789 struct cpl_abort_rpl_rss *rpl = cplhdr(m); 2790 struct toepcb *toep; 2791 2792 /* 2793 * Ignore replies to post-close aborts indicating that the abort was 2794 * requested too late. These connections are terminated when we get 2795 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss 2796 * arrives the TID is either no longer used or it has been recycled. 
2797 */ 2798 if (rpl->status == CPL_ERR_ABORT_FAILED) { 2799discard: 2800 m_free(m); 2801 return (0); 2802 } 2803 2804 toep = (struct toepcb *)ctx; 2805 2806 /* 2807 * Sometimes we've already closed the socket, e.g., a post-close 2808 * abort races with ABORT_REQ_RSS, the latter frees the socket 2809 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, 2810 * but FW turns the ABORT_REQ into a regular one and so we get 2811 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. 2812 */ 2813 if (!toep) 2814 goto discard; 2815 2816 if (toep->tp_tp == NULL) { 2817 log(LOG_NOTICE, "removing tid for abort\n"); 2818 cxgb_remove_tid(cdev, toep, toep->tp_tid); 2819 if (toep->tp_l2t) 2820 l2t_release(L2DATA(cdev), toep->tp_l2t); 2821 2822 toepcb_release(toep); 2823 goto discard; 2824 } 2825 2826 log(LOG_NOTICE, "toep=%p\n", toep); 2827 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp); 2828 2829 toepcb_hold(toep); 2830 process_abort_rpl(toep, m); 2831 toepcb_release(toep); 2832 return (0); 2833} 2834 2835/* 2836 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also 2837 * indicate whether RST should be sent in response. 2838 */ 2839static int 2840abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst) 2841{ 2842 struct tcpcb *tp = so_sototcpcb(so); 2843 2844 switch (abort_reason) { 2845 case CPL_ERR_BAD_SYN: 2846#if 0 2847 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through 2848#endif 2849 case CPL_ERR_CONN_RESET: 2850 // XXX need to handle SYN_RECV due to crossed SYNs 2851 return (tp->t_state == TCPS_CLOSE_WAIT ? 
EPIPE : ECONNRESET); 2852 case CPL_ERR_XMIT_TIMEDOUT: 2853 case CPL_ERR_PERSIST_TIMEDOUT: 2854 case CPL_ERR_FINWAIT2_TIMEDOUT: 2855 case CPL_ERR_KEEPALIVE_TIMEDOUT: 2856#if 0 2857 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); 2858#endif 2859 return (ETIMEDOUT); 2860 default: 2861 return (EIO); 2862 } 2863} 2864 2865static inline void 2866set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd) 2867{ 2868 struct cpl_abort_rpl *rpl = cplhdr(m); 2869 2870 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); 2871 rpl->wr.wr_lo = htonl(V_WR_TID(tid)); 2872 m->m_len = m->m_pkthdr.len = sizeof(*rpl); 2873 2874 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); 2875 rpl->cmd = cmd; 2876} 2877 2878static void 2879send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m) 2880{ 2881 struct mbuf *reply_mbuf; 2882 struct cpl_abort_req_rss *req = cplhdr(m); 2883 2884 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl)); 2885 m_set_priority(m, CPL_PRIORITY_DATA); 2886 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl); 2887 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status); 2888 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2889 m_free(m); 2890} 2891 2892/* 2893 * Returns whether an ABORT_REQ_RSS message is a negative advice. 2894 */ 2895static inline int 2896is_neg_adv_abort(unsigned int status) 2897{ 2898 return status == CPL_ERR_RTX_NEG_ADVICE || 2899 status == CPL_ERR_PERSIST_NEG_ADVICE; 2900} 2901 2902static void 2903send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status) 2904{ 2905 struct mbuf *reply_mbuf; 2906 struct cpl_abort_req_rss *req = cplhdr(m); 2907 2908 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 2909 2910 if (!reply_mbuf) { 2911 /* Defer the reply. Stick rst_status into req->cmd. 
*/ 2912 req->status = rst_status; 2913 t3_defer_reply(m, tdev, send_deferred_abort_rpl); 2914 return; 2915 } 2916 2917 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA); 2918 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status); 2919 m_free(m); 2920 2921 /* 2922 * XXX need to sync with ARP as for SYN_RECV connections we can send 2923 * these messages while ARP is pending. For other connection states 2924 * it's not a problem. 2925 */ 2926 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 2927} 2928 2929#ifdef notyet 2930static void 2931cleanup_syn_rcv_conn(struct socket *child, struct socket *parent) 2932{ 2933 CXGB_UNIMPLEMENTED(); 2934#ifdef notyet 2935 struct request_sock *req = child->sk_user_data; 2936 2937 inet_csk_reqsk_queue_removed(parent, req); 2938 synq_remove(tcp_sk(child)); 2939 __reqsk_free(req); 2940 child->sk_user_data = NULL; 2941#endif 2942} 2943 2944 2945/* 2946 * Performs the actual work to abort a SYN_RECV connection. 2947 */ 2948static void 2949do_abort_syn_rcv(struct socket *child, struct socket *parent) 2950{ 2951 struct tcpcb *parenttp = so_sototcpcb(parent); 2952 struct tcpcb *childtp = so_sototcpcb(child); 2953 2954 /* 2955 * If the server is still open we clean up the child connection, 2956 * otherwise the server already did the clean up as it was purging 2957 * its SYN queue and the skb was just sitting in its backlog. 2958 */ 2959 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) { 2960 cleanup_syn_rcv_conn(child, parent); 2961 inp_wlock(childtp->t_inpcb); 2962 t3_release_offload_resources(childtp->t_toe); 2963 inp_wunlock(childtp->t_inpcb); 2964 tcp_offload_close(childtp); 2965 } 2966} 2967#endif 2968 2969/* 2970 * Handle abort requests for a SYN_RECV connection. These need extra work 2971 * because the socket is on its parent's SYN queue. 
2972 */ 2973static int 2974abort_syn_rcv(struct socket *so, struct mbuf *m) 2975{ 2976 CXGB_UNIMPLEMENTED(); 2977#ifdef notyet 2978 struct socket *parent; 2979 struct toedev *tdev = toep->tp_toedev; 2980 struct t3cdev *cdev = TOM_DATA(tdev)->cdev; 2981 struct socket *oreq = so->so_incomp; 2982 struct t3c_tid_entry *t3c_stid; 2983 struct tid_info *t; 2984 2985 if (!oreq) 2986 return -1; /* somehow we are not on the SYN queue */ 2987 2988 t = &(T3C_DATA(cdev))->tid_maps; 2989 t3c_stid = lookup_stid(t, oreq->ts_recent); 2990 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 2991 2992 so_lock(parent); 2993 do_abort_syn_rcv(so, parent); 2994 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST); 2995 so_unlock(parent); 2996#endif 2997 return (0); 2998} 2999 3000/* 3001 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this 3002 * request except that we need to reply to it. 3003 */ 3004static void 3005process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev) 3006{ 3007 int rst_status = CPL_ABORT_NO_RST; 3008 const struct cpl_abort_req_rss *req = cplhdr(m); 3009 struct tcpcb *tp = toep->tp_tp; 3010 struct socket *so; 3011 int needclose = 0; 3012 3013 inp_wlock(tp->t_inpcb); 3014 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb); 3015 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) { 3016 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN); 3017 m_free(m); 3018 goto skip; 3019 } 3020 3021 toep->tp_flags &= ~TP_ABORT_REQ_RCVD; 3022 /* 3023 * Three cases to consider: 3024 * a) We haven't sent an abort_req; close the connection. 3025 * b) We have sent a post-close abort_req that will get to TP too late 3026 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will 3027 * be ignored and the connection should be closed now. 3028 * c) We have sent a regular abort_req that will get to TP too late. 3029 * That will generate an abort_rpl with status 0, wait for it. 
3030 */ 3031 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) || 3032 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) { 3033 int error; 3034 3035 error = abort_status_to_errno(so, req->status, 3036 &rst_status); 3037 so_error_set(so, error); 3038 3039 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0)) 3040 so_sorwakeup(so); 3041 /* 3042 * SYN_RECV needs special processing. If abort_syn_rcv() 3043 * returns 0 is has taken care of the abort. 3044 */ 3045 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m)) 3046 goto skip; 3047 3048 t3_release_offload_resources(toep); 3049 needclose = 1; 3050 } 3051 inp_wunlock(tp->t_inpcb); 3052 3053 if (needclose) 3054 tcp_offload_close(tp); 3055 3056 send_abort_rpl(m, tdev, rst_status); 3057 return; 3058skip: 3059 inp_wunlock(tp->t_inpcb); 3060} 3061 3062/* 3063 * Handle an ABORT_REQ_RSS CPL message. 3064 */ 3065static int 3066do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx) 3067{ 3068 const struct cpl_abort_req_rss *req = cplhdr(m); 3069 struct toepcb *toep = (struct toepcb *)ctx; 3070 3071 if (is_neg_adv_abort(req->status)) { 3072 m_free(m); 3073 return (0); 3074 } 3075 3076 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid); 3077 3078 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) { 3079 cxgb_remove_tid(cdev, toep, toep->tp_tid); 3080 toep->tp_flags |= TP_ABORT_REQ_RCVD; 3081 3082 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST); 3083 if (toep->tp_l2t) 3084 l2t_release(L2DATA(cdev), toep->tp_l2t); 3085 3086 /* 3087 * Unhook 3088 */ 3089 toep->tp_tp->t_toe = NULL; 3090 toep->tp_tp->t_flags &= ~TF_TOE; 3091 toep->tp_tp = NULL; 3092 /* 3093 * XXX need to call syncache_chkrst - but we don't 3094 * have a way of doing that yet 3095 */ 3096 toepcb_release(toep); 3097 log(LOG_ERR, "abort for unestablished connection :-(\n"); 3098 return (0); 3099 } 3100 if (toep->tp_tp == NULL) { 3101 log(LOG_NOTICE, "disconnected toepcb\n"); 3102 /* should be freed 
momentarily */ 3103 return (0); 3104 } 3105 3106 3107 toepcb_hold(toep); 3108 process_abort_req(toep, m, toep->tp_toedev); 3109 toepcb_release(toep); 3110 return (0); 3111} 3112#ifdef notyet 3113static void 3114pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m) 3115{ 3116 struct toedev *tdev = TOE_DEV(parent); 3117 3118 do_abort_syn_rcv(child, parent); 3119 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) { 3120 struct cpl_pass_accept_rpl *rpl = cplhdr(m); 3121 3122 rpl->opt0h = htonl(F_TCAM_BYPASS); 3123 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3124 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m); 3125 } else 3126 m_free(m); 3127} 3128#endif 3129static void 3130handle_pass_open_arp_failure(struct socket *so, struct mbuf *m) 3131{ 3132 CXGB_UNIMPLEMENTED(); 3133 3134#ifdef notyet 3135 struct t3cdev *cdev; 3136 struct socket *parent; 3137 struct socket *oreq; 3138 struct t3c_tid_entry *t3c_stid; 3139 struct tid_info *t; 3140 struct tcpcb *otp, *tp = so_sototcpcb(so); 3141 struct toepcb *toep = tp->t_toe; 3142 3143 /* 3144 * If the connection is being aborted due to the parent listening 3145 * socket going away there's nothing to do, the ABORT_REQ will close 3146 * the connection. 3147 */ 3148 if (toep->tp_flags & TP_ABORT_RPL_PENDING) { 3149 m_free(m); 3150 return; 3151 } 3152 3153 oreq = so->so_incomp; 3154 otp = so_sototcpcb(oreq); 3155 3156 cdev = T3C_DEV(so); 3157 t = &(T3C_DATA(cdev))->tid_maps; 3158 t3c_stid = lookup_stid(t, otp->ts_recent); 3159 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso; 3160 3161 so_lock(parent); 3162 pass_open_abort(so, parent, m); 3163 so_unlock(parent); 3164#endif 3165} 3166 3167/* 3168 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly 3169 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV 3170 * connection. 
3171 */ 3172static void 3173pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m) 3174{ 3175 3176#ifdef notyet 3177 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3178 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk); 3179#endif 3180 handle_pass_open_arp_failure(m_get_socket(m), m); 3181} 3182 3183/* 3184 * Populate a reject CPL_PASS_ACCEPT_RPL WR. 3185 */ 3186static void 3187mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf) 3188{ 3189 struct cpl_pass_accept_req *req = cplhdr(req_mbuf); 3190 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf); 3191 unsigned int tid = GET_TID(req); 3192 3193 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP); 3194 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3195 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3196 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet 3197 rpl->opt0h = htonl(F_TCAM_BYPASS); 3198 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); 3199 rpl->opt2 = 0; 3200 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3201} 3202 3203/* 3204 * Send a deferred reject to an accept request. 
3205 */ 3206static void 3207reject_pass_request(struct toedev *tdev, struct mbuf *m) 3208{ 3209 struct mbuf *reply_mbuf; 3210 3211 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl)); 3212 mk_pass_accept_rpl(reply_mbuf, m); 3213 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf); 3214 m_free(m); 3215} 3216 3217static void 3218handle_syncache_event(int event, void *arg) 3219{ 3220 struct toepcb *toep = arg; 3221 3222 switch (event) { 3223 case TOE_SC_ENTRY_PRESENT: 3224 /* 3225 * entry already exists - free toepcb 3226 * and l2t 3227 */ 3228 printf("syncache entry present\n"); 3229 toepcb_release(toep); 3230 break; 3231 case TOE_SC_DROP: 3232 /* 3233 * The syncache has given up on this entry 3234 * either it timed out, or it was evicted 3235 * we need to explicitly release the tid 3236 */ 3237 printf("syncache entry dropped\n"); 3238 toepcb_release(toep); 3239 break; 3240 default: 3241 log(LOG_ERR, "unknown syncache event %d\n", event); 3242 break; 3243 } 3244} 3245 3246static void 3247syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep) 3248{ 3249 struct in_conninfo inc; 3250 struct tcpopt to; 3251 struct tcphdr th; 3252 struct inpcb *inp; 3253 int mss, wsf, sack, ts; 3254 uint32_t rcv_isn = ntohl(req->rcv_isn); 3255 3256 bzero(&to, sizeof(struct tcpopt)); 3257 inp = so_sotoinpcb(lso); 3258 3259 /* 3260 * Fill out information for entering us into the syncache 3261 */ 3262 bzero(&inc, sizeof(inc)); 3263 inc.inc_fport = th.th_sport = req->peer_port; 3264 inc.inc_lport = th.th_dport = req->local_port; 3265 th.th_seq = req->rcv_isn; 3266 th.th_flags = TH_SYN; 3267 3268 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1; 3269 3270 3271 inc.inc_isipv6 = 0; 3272 inc.inc_len = 0; 3273 inc.inc_faddr.s_addr = req->peer_ip; 3274 inc.inc_laddr.s_addr = req->local_ip; 3275 3276 DPRINTF("syncache add of %d:%d %d:%d\n", 3277 ntohl(req->local_ip), ntohs(req->local_port), 3278 
ntohl(req->peer_ip), ntohs(req->peer_port)); 3279 3280 mss = req->tcp_options.mss; 3281 wsf = req->tcp_options.wsf; 3282 ts = req->tcp_options.tstamp; 3283 sack = req->tcp_options.sack; 3284 to.to_mss = mss; 3285 to.to_wscale = wsf; 3286 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3287 tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep); 3288} 3289 3290 3291/* 3292 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket 3293 * lock held. Note that the sock here is a listening socket that is not owned 3294 * by the TOE. 3295 */ 3296static void 3297process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev, 3298 struct listen_ctx *lctx) 3299{ 3300 int rt_flags; 3301 struct l2t_entry *e; 3302 struct iff_mac tim; 3303 struct mbuf *reply_mbuf, *ddp_mbuf = NULL; 3304 struct cpl_pass_accept_rpl *rpl; 3305 struct cpl_pass_accept_req *req = cplhdr(m); 3306 unsigned int tid = GET_TID(req); 3307 struct tom_data *d = TOM_DATA(tdev); 3308 struct t3cdev *cdev = d->cdev; 3309 struct tcpcb *tp = so_sototcpcb(so); 3310 struct toepcb *newtoep; 3311 struct rtentry *dst; 3312 struct sockaddr_in nam; 3313 struct t3c_data *td = T3C_DATA(cdev); 3314 3315 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3316 if (__predict_false(reply_mbuf == NULL)) { 3317 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3318 t3_defer_reply(m, tdev, reject_pass_request); 3319 else { 3320 cxgb_queue_tid_release(cdev, tid); 3321 m_free(m); 3322 } 3323 DPRINTF("failed to get reply_mbuf\n"); 3324 3325 goto out; 3326 } 3327 3328 if (tp->t_state != TCPS_LISTEN) { 3329 DPRINTF("socket not in listen state\n"); 3330 3331 goto reject; 3332 } 3333 3334 tim.mac_addr = req->dst_mac; 3335 tim.vlan_tag = ntohs(req->vlan_tag); 3336 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) { 3337 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n"); 3338 goto reject; 3339 } 3340 3341#ifdef notyet 3342 /* 3343 
* XXX do route lookup to confirm that we're still listening on this 3344 * address 3345 */ 3346 if (ip_route_input(skb, req->local_ip, req->peer_ip, 3347 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) 3348 goto reject; 3349 rt_flags = ((struct rtable *)skb->dst)->rt_flags & 3350 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); 3351 dst_release(skb->dst); // done with the input route, release it 3352 skb->dst = NULL; 3353 3354 if ((rt_flags & RTF_LOCAL) == 0) 3355 goto reject; 3356#endif 3357 /* 3358 * XXX 3359 */ 3360 rt_flags = RTF_LOCAL; 3361 if ((rt_flags & RTF_LOCAL) == 0) 3362 goto reject; 3363 3364 /* 3365 * Calculate values and add to syncache 3366 */ 3367 3368 newtoep = toepcb_alloc(); 3369 if (newtoep == NULL) 3370 goto reject; 3371 3372 bzero(&nam, sizeof(struct sockaddr_in)); 3373 3374 nam.sin_len = sizeof(struct sockaddr_in); 3375 nam.sin_family = AF_INET; 3376 nam.sin_addr.s_addr =req->peer_ip; 3377 dst = rtalloc2((struct sockaddr *)&nam, 1, 0); 3378 3379 if (dst == NULL) { 3380 printf("failed to find route\n"); 3381 goto reject; 3382 } 3383 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev, 3384 (struct sockaddr *)&nam); 3385 if (e == NULL) { 3386 DPRINTF("failed to get l2t\n"); 3387 } 3388 /* 3389 * Point to our listen socket until accept 3390 */ 3391 newtoep->tp_tp = tp; 3392 newtoep->tp_flags = TP_SYN_RCVD; 3393 newtoep->tp_tid = tid; 3394 newtoep->tp_toedev = tdev; 3395 tp->rcv_wnd = select_rcv_wnd(tdev, so); 3396 3397 cxgb_insert_tid(cdev, d->client, newtoep, tid); 3398 so_lock(so); 3399 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry); 3400 so_unlock(so); 3401 3402 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) && 3403 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? 
ULP_MODE_TCPDDP : 0; 3404 3405 if (newtoep->tp_ulp_mode) { 3406 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA); 3407 3408 if (ddp_mbuf == NULL) 3409 newtoep->tp_ulp_mode = 0; 3410 } 3411 3412 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d", 3413 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode); 3414 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure); 3415 /* 3416 * XXX workaround for lack of syncache drop 3417 */ 3418 toepcb_hold(newtoep); 3419 syncache_add_accept_req(req, so, newtoep); 3420 3421 rpl = cplhdr(reply_mbuf); 3422 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl); 3423 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 3424 rpl->wr.wr_lo = 0; 3425 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); 3426 rpl->opt2 = htonl(calc_opt2(so, tdev)); 3427 rpl->rsvd = rpl->opt2; /* workaround for HW bug */ 3428 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten 3429 3430 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) | 3431 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx)); 3432 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) | 3433 CPL_PASS_OPEN_ACCEPT); 3434 3435 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status); 3436 3437 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep)); 3438 3439 l2t_send(cdev, reply_mbuf, e); 3440 m_free(m); 3441 if (newtoep->tp_ulp_mode) { 3442 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS, 3443 V_TF_DDP_OFF(1) | 3444 TP_DDP_TIMER_WORKAROUND_MASK, 3445 V_TF_DDP_OFF(1) | 3446 TP_DDP_TIMER_WORKAROUND_VAL, 1); 3447 } else 3448 printf("not offloading\n"); 3449 3450 3451 3452 return; 3453reject: 3454 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) 3455 mk_pass_accept_rpl(reply_mbuf, m); 3456 else 3457 mk_tid_release(reply_mbuf, newtoep, tid); 3458 cxgb_ofld_send(cdev, reply_mbuf); 3459 m_free(m); 3460out: 3461#if 0 3462 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); 3463#else 3464 return; 3465#endif 3466} 3467 

/*
 * Handle a CPL_PASS_ACCEPT_REQ message (an incoming SYN on an offloaded
 * listener).  Thin dispatcher: validates the TID (optionally) and hands the
 * real work to process_pass_accept_req() with the listening socket.
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	/*
	 * NOTE(review): this validation block is unported Linux code —
	 * 'lsk', 'printk', 'unlikely' are not defined in this file; it will
	 * not compile if VALIDATE_TID is ever enabled.
	 */
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to FreeBSD's native format.
3516 */ 3517static void 3518assign_rxopt(struct socket *so, unsigned int opt) 3519{ 3520 struct tcpcb *tp = so_sototcpcb(so); 3521 struct toepcb *toep = tp->t_toe; 3522 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep)); 3523 3524 inp_lock_assert(tp->t_inpcb); 3525 3526 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3527 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0; 3528 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0; 3529 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0; 3530 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 3531 (TF_RCVD_SCALE|TF_REQ_SCALE)) 3532 tp->rcv_scale = tp->request_r_scale; 3533} 3534 3535/* 3536 * Completes some final bits of initialization for just established connections 3537 * and changes their state to TCP_ESTABLISHED. 3538 * 3539 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. 3540 */ 3541static void 3542make_established(struct socket *so, u32 snd_isn, unsigned int opt) 3543{ 3544 struct tcpcb *tp = so_sototcpcb(so); 3545 struct toepcb *toep = tp->t_toe; 3546 3547 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn; 3548 assign_rxopt(so, opt); 3549 3550 /* 3551 *XXXXXXXXXXX 3552 * 3553 */ 3554#ifdef notyet 3555 so->so_proto->pr_ctloutput = t3_ctloutput; 3556#endif 3557 3558#if 0 3559 inet_sk(sk)->id = tp->write_seq ^ jiffies; 3560#endif 3561 /* 3562 * XXX not clear what rcv_wup maps to 3563 */ 3564 /* 3565 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't 3566 * pass through opt0. 
3567 */ 3568 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) 3569 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); 3570 3571 dump_toepcb(toep); 3572 3573#ifdef notyet 3574/* 3575 * no clean interface for marking ARP up to date 3576 */ 3577 dst_confirm(sk->sk_dst_cache); 3578#endif 3579 tp->t_starttime = ticks; 3580 tp->t_state = TCPS_ESTABLISHED; 3581 soisconnected(so); 3582} 3583 3584static int 3585syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep) 3586{ 3587 3588 struct in_conninfo inc; 3589 struct tcpopt to; 3590 struct tcphdr th; 3591 int mss, wsf, sack, ts; 3592 struct mbuf *m = NULL; 3593 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev); 3594 unsigned int opt; 3595 3596#ifdef MAC 3597#error "no MAC support" 3598#endif 3599 3600 opt = ntohs(req->tcp_opt); 3601 3602 bzero(&to, sizeof(struct tcpopt)); 3603 3604 /* 3605 * Fill out information for entering us into the syncache 3606 */ 3607 bzero(&inc, sizeof(inc)); 3608 inc.inc_fport = th.th_sport = req->peer_port; 3609 inc.inc_lport = th.th_dport = req->local_port; 3610 th.th_seq = req->rcv_isn; 3611 th.th_flags = TH_ACK; 3612 3613 inc.inc_isipv6 = 0; 3614 inc.inc_len = 0; 3615 inc.inc_faddr.s_addr = req->peer_ip; 3616 inc.inc_laddr.s_addr = req->local_ip; 3617 3618 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40; 3619 wsf = G_TCPOPT_WSCALE_OK(opt); 3620 ts = G_TCPOPT_TSTAMP(opt); 3621 sack = G_TCPOPT_SACK(opt); 3622 3623 to.to_mss = mss; 3624 to.to_wscale = G_TCPOPT_SND_WSCALE(opt); 3625 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0); 3626 3627 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n", 3628 ntohl(req->local_ip), ntohs(req->local_port), 3629 ntohl(req->peer_ip), ntohs(req->peer_port), 3630 mss, wsf, ts, sack); 3631 return tcp_offload_syncache_expand(&inc, &to, &th, so, m); 3632} 3633 3634 3635/* 3636 * Process a CPL_PASS_ESTABLISH message. 
 * XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	struct sockbuf *snd, *rcv;

	// Complete socket initialization now that we have the SND_ISN

	struct toedev *tdev;


	tdev = toep->tp_toedev;

	inp_wlock(tp->t_inpcb);

	/*
	 *
	 * XXX need to add reference while we're manipulating
	 */
	/* 'lso' (the listen socket) is only used for this initial lookup. */
	so = lso = inp_inpcbtosocket(tp->t_inpcb);

	inp_wunlock(tp->t_inpcb);

	/* Take the embryonic connection off the listener's SYN queue. */
	so_lock(so);
	LIST_REMOVE(toep, synq_entry);
	so_unlock(so);

	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry
		 */
		CXGB_UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		CXGB_UNIMPLEMENTED();
	}

	/* From here on 'tp'/'so' refer to the newly created connection. */
	tp = so_sototcpcb(so);
	inp_wlock(tp->t_inpcb);

	snd = so_sockbuf_snd(so);
	rcv = so_sockbuf_rcv(so);

	/* HW owns segmentation; don't let sockbuf code coalesce mbufs. */
	snd->sb_flags |= SB_NOCOALESCE;
	rcv->sb_flags |= SB_NOCOALESCE;

	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(tdev, so);
	tp->rcv_nxt = toep->tp_copied_seq;
	install_offload_ops(so);

	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	/* RSS queue number is carried in csum_data by the CPL demux. */
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

	/*
	 * XXX Cancel any keep alive timer
	 */

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * XXX workaround for lack of syncache drop
	 */
	/* Drop the reference taken in process_pass_accept_req(). */
	toepcb_release(toep);
	inp_wunlock(tp->t_inpcb);

	CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct toepcb *toep)
{
	struct mbuf *m;
	struct toedev *tdev = toep->tp_toedev;
	struct tcpcb *tp = toep->tp_tp;
	unsigned int tid = toep->tp_tid;

	log(LOG_NOTICE, "fixup_and_send_ofo\n");

	inp_lock_assert(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(toep);

	if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
		/*
		 * XXX does this even make sense?
		 */
		so_sorwakeup(so);
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	toep->tp_state = tp->t_state;
	V_tcpstat.tcps_connects++;

}

/*
 * Process a CPL_ACT_ESTABLISH message (active open completed).
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so;
	struct toedev *tdev;
	struct tom_data *d;

	if (tp == NULL) {
		/* Connection already gone; just return the active-open TID. */
		free_atid(cdev, atid);
		return (0);
	}
	inp_wlock(tp->t_inpcb);

	/*
	 * XXX
	 */
	so = inp_inpcbtosocket(tp->t_inpcb);
	tdev = toep->tp_toedev; /* blow up here if link was down */
	d = TOM_DATA(tdev);

	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, toep, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	inp_wunlock(tp->t_inpcb);
	CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
	cxgb_log_tcb(cdev->adapter, toep->tp_tid);

	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
3887 */ 3888static void 3889wr_ack(struct toepcb *toep, struct mbuf *m) 3890{ 3891 struct tcpcb *tp = toep->tp_tp; 3892 struct cpl_wr_ack *hdr = cplhdr(m); 3893 struct socket *so; 3894 unsigned int credits = ntohs(hdr->credits); 3895 u32 snd_una = ntohl(hdr->snd_una); 3896 int bytes = 0; 3897 struct sockbuf *snd; 3898 3899 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits); 3900 3901 inp_wlock(tp->t_inpcb); 3902 so = inp_inpcbtosocket(tp->t_inpcb); 3903 toep->tp_wr_avail += credits; 3904 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail) 3905 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail; 3906 3907 while (credits) { 3908 struct mbuf *p = peek_wr(toep); 3909 3910 if (__predict_false(!p)) { 3911 log(LOG_ERR, "%u WR_ACK credits for TID %u with " 3912 "nothing pending, state %u wr_avail=%u\n", 3913 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail); 3914 break; 3915 } 3916 CTR2(KTR_TOM, 3917 "wr_ack: p->credits=%d p->bytes=%d", 3918 p->m_pkthdr.csum_data, p->m_pkthdr.len); 3919 KASSERT(p->m_pkthdr.csum_data != 0, 3920 ("empty request still on list")); 3921 3922 if (__predict_false(credits < p->m_pkthdr.csum_data)) { 3923 3924#if DEBUG_WR > 1 3925 struct tx_data_wr *w = cplhdr(p); 3926 log(LOG_ERR, 3927 "TID %u got %u WR credits, need %u, len %u, " 3928 "main body %u, frags %u, seq # %u, ACK una %u," 3929 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", 3930 toep->tp_tid, credits, p->csum, p->len, 3931 p->len - p->data_len, skb_shinfo(p)->nr_frags, 3932 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), 3933 toep->tp_wr_avail, count_pending_wrs(tp) - credits); 3934#endif 3935 p->m_pkthdr.csum_data -= credits; 3936 break; 3937 } else { 3938 dequeue_wr(toep); 3939 credits -= p->m_pkthdr.csum_data; 3940 bytes += p->m_pkthdr.len; 3941 CTR3(KTR_TOM, 3942 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d", 3943 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data); 3944 3945 m_free(p); 3946 } 3947 } 3948 3949#if DEBUG_WR 3950 
check_wr_invariants(tp); 3951#endif 3952 3953 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) { 3954#if VALIDATE_SEQ 3955 struct tom_data *d = TOM_DATA(TOE_DEV(so)); 3956 3957 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK " 3958 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, 3959 toep->tp_tid, tp->snd_una); 3960#endif 3961 goto out_free; 3962 } 3963 3964 if (tp->snd_una != snd_una) { 3965 tp->snd_una = snd_una; 3966 tp->ts_recent_age = ticks; 3967#ifdef notyet 3968 /* 3969 * Keep ARP entry "minty fresh" 3970 */ 3971 dst_confirm(sk->sk_dst_cache); 3972#endif 3973 if (tp->snd_una == tp->snd_nxt) 3974 toep->tp_flags &= ~TP_TX_WAIT_IDLE; 3975 } 3976 3977 snd = so_sockbuf_snd(so); 3978 if (bytes) { 3979 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes); 3980 snd = so_sockbuf_snd(so); 3981 sockbuf_lock(snd); 3982 sbdrop_locked(snd, bytes); 3983 so_sowwakeup_locked(so); 3984 } 3985 3986 if (snd->sb_sndptroff < snd->sb_cc) 3987 t3_push_frames(so, 0); 3988 3989out_free: 3990 inp_wunlock(tp->t_inpcb); 3991 m_free(m); 3992} 3993 3994/* 3995 * Handler for TX_DATA_ACK CPL messages. 3996 */ 3997static int 3998do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx) 3999{ 4000 struct toepcb *toep = (struct toepcb *)ctx; 4001 4002 VALIDATE_SOCK(so); 4003 4004 wr_ack(toep, m); 4005 return 0; 4006} 4007 4008/* 4009 * Handler for TRACE_PKT CPL messages. Just sink these packets. 4010 */ 4011static int 4012do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx) 4013{ 4014 m_freem(m); 4015 return 0; 4016} 4017 4018/* 4019 * Reset a connection that is on a listener's SYN queue or accept queue, 4020 * i.e., one that has not had a struct socket associated with it. 4021 * Must be called from process context. 4022 * 4023 * Modeled after code in inet_csk_listen_stop(). 
4024 */ 4025static void 4026t3_reset_listen_child(struct socket *child) 4027{ 4028 struct tcpcb *tp = so_sototcpcb(child); 4029 4030 t3_send_reset(tp->t_toe); 4031} 4032 4033 4034static void 4035t3_child_disconnect(struct socket *so, void *arg) 4036{ 4037 struct tcpcb *tp = so_sototcpcb(so); 4038 4039 if (tp->t_flags & TF_TOE) { 4040 inp_wlock(tp->t_inpcb); 4041 t3_reset_listen_child(so); 4042 inp_wunlock(tp->t_inpcb); 4043 } 4044} 4045 4046/* 4047 * Disconnect offloaded established but not yet accepted connections sitting 4048 * on a server's accept_queue. We just send an ABORT_REQ at this point and 4049 * finish off the disconnect later as we may need to wait for the ABORT_RPL. 4050 */ 4051void 4052t3_disconnect_acceptq(struct socket *listen_so) 4053{ 4054 4055 so_lock(listen_so); 4056 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL); 4057 so_unlock(listen_so); 4058} 4059 4060/* 4061 * Reset offloaded connections sitting on a server's syn queue. As above 4062 * we send ABORT_REQ and finish off when we get ABORT_RPL. 
4063 */ 4064 4065void 4066t3_reset_synq(struct listen_ctx *lctx) 4067{ 4068 struct toepcb *toep; 4069 4070 so_lock(lctx->lso); 4071 while (!LIST_EMPTY(&lctx->synq_head)) { 4072 toep = LIST_FIRST(&lctx->synq_head); 4073 LIST_REMOVE(toep, synq_entry); 4074 toep->tp_tp = NULL; 4075 t3_send_reset(toep); 4076 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid); 4077 toepcb_release(toep); 4078 } 4079 so_unlock(lctx->lso); 4080} 4081 4082 4083int 4084t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl, 4085 unsigned int nppods, unsigned int tag, unsigned int maxoff, 4086 unsigned int pg_off, unsigned int color) 4087{ 4088 unsigned int i, j, pidx; 4089 struct pagepod *p; 4090 struct mbuf *m; 4091 struct ulp_mem_io *req; 4092 unsigned int tid = toep->tp_tid; 4093 const struct tom_data *td = TOM_DATA(toep->tp_toedev); 4094 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; 4095 4096 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)", 4097 gl, nppods, tag, maxoff, pg_off, color); 4098 4099 for (i = 0; i < nppods; ++i) { 4100 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE); 4101 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4102 req = mtod(m, struct ulp_mem_io *); 4103 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE; 4104 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4105 req->wr.wr_lo = 0; 4106 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | 4107 V_ULPTX_CMD(ULP_MEM_WRITE)); 4108 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | 4109 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); 4110 4111 p = (struct pagepod *)(req + 1); 4112 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) { 4113 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); 4114 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) | 4115 V_PPOD_COLOR(color)); 4116 p->pp_max_offset = htonl(maxoff); 4117 p->pp_page_offset = htonl(pg_off); 4118 p->pp_rsvd = 0; 4119 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) 4120 p->pp_addr[j] = 
pidx < gl->dgl_nelem ? 4121 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0; 4122 } else 4123 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */ 4124 send_or_defer(toep, m, 0); 4125 ppod_addr += PPOD_SIZE; 4126 } 4127 return (0); 4128} 4129 4130/* 4131 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. 4132 */ 4133static inline void 4134mk_cpl_barrier_ulp(struct cpl_barrier *b) 4135{ 4136 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; 4137 4138 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4139 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); 4140 b->opcode = CPL_BARRIER; 4141} 4142 4143/* 4144 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. 4145 */ 4146static inline void 4147mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno) 4148{ 4149 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4150 4151 txpkt = (struct ulp_txpkt *)req; 4152 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4153 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4154 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); 4155 req->cpuno = htons(cpuno); 4156} 4157 4158/* 4159 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. 4160 */ 4161static inline void 4162mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid, 4163 unsigned int word, uint64_t mask, uint64_t val) 4164{ 4165 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; 4166 4167 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx", 4168 tid, word, mask, val); 4169 4170 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); 4171 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); 4172 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); 4173 req->reply = V_NO_REPLY(1); 4174 req->cpu_idx = 0; 4175 req->word = htons(word); 4176 req->mask = htobe64(mask); 4177 req->val = htobe64(val); 4178} 4179 4180/* 4181 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. 
 */
static void
mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
		   unsigned int tid, unsigned int credits)
{
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
				 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
				 V_RX_CREDITS(credits));
}

/*
 * Cancel (invalidate) HW DDP buffer 'bufidx' for the connection.  Sends a
 * single compound work request: barrier, SET_TCB_FIELD clearing the buffer's
 * valid bit, GET_TCB (to learn how much data landed in the buffer), barrier.
 */
void
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
{
	unsigned int wrlen;
	struct mbuf *m;
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

#if 0
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
#endif
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
		sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	bzero(wr, wrlen);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	/* Leading barrier keeps the following CPLs ordered. */
	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/* Hmmm, not sure if this actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already. However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
	if (bufidx == 0)
		/* Invalidate buffer 0 and make buffer 1 the active one. */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF0_VALID(1),
				     V_TF_DDP_ACTIVE_BUF(1));
	else
		/* Invalidate buffer 1 and make buffer 0 the active one. */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
				     V_TF_DDP_ACTIVE_BUF(1) |
				     V_TF_DDP_BUF1_VALID(1), 0);

	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of outstanding CPL_GET_TCB requests
	 */
	p->get_tcb_count++;

#ifdef T3_TRACE
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
}

/**
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @sk: the socket associated with the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took place.
4274 */ 4275void 4276t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0, 4277 unsigned int tag1, unsigned int len) 4278{ 4279 unsigned int wrlen; 4280 struct mbuf *m; 4281 struct work_request_hdr *wr; 4282 struct cpl_get_tcb *getreq; 4283 struct cpl_set_tcb_field *req; 4284 struct ddp_state *p = &toep->tp_ddp_state; 4285 4286 CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)", 4287 bufidx, tag0, tag1, len); 4288#if 0 4289 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4290#endif 4291 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); 4292 m = m_gethdr_nofail(wrlen); 4293 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4294 wr = mtod(m, struct work_request_hdr *); 4295 m->m_pkthdr.len = m->m_len = wrlen; 4296 bzero(wr, wrlen); 4297 4298 4299 /* Set the ATOMIC flag to make sure that TP processes the following 4300 * CPLs in an atomic manner and no wire segments can be interleaved. 4301 */ 4302 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); 4303 req = (struct cpl_set_tcb_field *)(wr + 1); 4304 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG, 4305 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | 4306 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, 4307 V_TCB_RX_DDP_BUF0_TAG(tag0) | 4308 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32); 4309 req++; 4310 if (bufidx == 0) { 4311 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN, 4312 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4313 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len)); 4314 req++; 4315 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, 4316 V_TF_DDP_PUSH_DISABLE_0(1) | 4317 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4318 V_TF_DDP_PUSH_DISABLE_0(0) | 4319 V_TF_DDP_BUF0_VALID(1)); 4320 } else { 4321 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN, 4322 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), 4323 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len)); 4324 req++; 4325 mk_set_tcb_field_ulp(req, toep->tp_tid, 
W_TCB_RX_DDP_FLAGS, 4326 V_TF_DDP_PUSH_DISABLE_1(1) | 4327 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 4328 V_TF_DDP_PUSH_DISABLE_1(0) | 4329 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); 4330 } 4331 4332 getreq = (struct cpl_get_tcb *)(req + 1); 4333 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset); 4334 4335 /* Keep track of the number of oustanding CPL_GET_TCB requests 4336 */ 4337 p->get_tcb_count++; 4338 4339#ifdef T3_TRACE 4340 T3_TRACE4(TIDTB(sk), 4341 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " 4342 "len %d", 4343 bufidx, tag0, tag1, len); 4344#endif 4345 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4346} 4347 4348/* 4349 * Sends a compound WR containing all the CPL messages needed to program the 4350 * two HW DDP buffers, namely optionally setting up the length and offset of 4351 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. 4352 */ 4353void 4354t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0, 4355 unsigned int len1, unsigned int offset1, 4356 uint64_t ddp_flags, uint64_t flag_mask, int modulate) 4357{ 4358 unsigned int wrlen; 4359 struct mbuf *m; 4360 struct work_request_hdr *wr; 4361 struct cpl_set_tcb_field *req; 4362 4363 CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ", 4364 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff); 4365 4366#if 0 4367 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv); 4368#endif 4369 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + 4370 (len1 ? sizeof(*req) : 0) + 4371 (modulate ? 
sizeof(struct cpl_rx_data_ack) : 0); 4372 m = m_gethdr_nofail(wrlen); 4373 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep)); 4374 wr = mtod(m, struct work_request_hdr *); 4375 bzero(wr, wrlen); 4376 4377 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); 4378 m->m_pkthdr.len = m->m_len = wrlen; 4379 4380 req = (struct cpl_set_tcb_field *)(wr + 1); 4381 if (len0) { /* program buffer 0 offset and length */ 4382 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET, 4383 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 4384 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 4385 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) | 4386 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0)); 4387 req++; 4388 } 4389 if (len1) { /* program buffer 1 offset and length */ 4390 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET, 4391 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 4392 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, 4393 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) | 4394 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32); 4395 req++; 4396 } 4397 4398 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask, 4399 ddp_flags); 4400 4401 if (modulate) { 4402 mk_rx_data_ack_ulp(toep, 4403 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid, 4404 toep->tp_copied_seq - toep->tp_rcv_wup); 4405 toep->tp_rcv_wup = toep->tp_copied_seq; 4406 } 4407 4408#ifdef T3_TRACE 4409 T3_TRACE5(TIDTB(sk), 4410 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " 4411 "modulate %d", 4412 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, 4413 modulate); 4414#endif 4415 4416 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); 4417} 4418 4419void 4420t3_init_wr_tab(unsigned int wr_len) 4421{ 4422 int i; 4423 4424 if (mbuf_wrs[1]) /* already initialized */ 4425 return; 4426 4427 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) { 4428 int sgl_len = (3 * i) / 2 + (i & 1); 4429 4430 sgl_len += 3; 4431 mbuf_wrs[i] = sgl_len <= wr_len ? 
4432 1 : 1 + (sgl_len - 2) / (wr_len - 1); 4433 } 4434 4435 wrlen = wr_len * 8; 4436} 4437 4438int 4439t3_init_cpl_io(void) 4440{ 4441#ifdef notyet 4442 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); 4443 if (!tcphdr_skb) { 4444 log(LOG_ERR, 4445 "Chelsio TCP offload: can't allocate sk_buff\n"); 4446 return -1; 4447 } 4448 skb_put(tcphdr_skb, sizeof(struct tcphdr)); 4449 tcphdr_skb->h.raw = tcphdr_skb->data; 4450 memset(tcphdr_skb->data, 0, tcphdr_skb->len); 4451#endif 4452 4453 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); 4454 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); 4455 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); 4456 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); 4457 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); 4458 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); 4459 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 4460 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 4461 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); 4462 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); 4463 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 4464 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 4465 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); 4466 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); 4467 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); 4468 return (0); 4469} 4470 4471