/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/tom/t4_ddp.c 312337 2017-01-17 07:43:37Z np $");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"

VNET_DECLARE(int, tcp_do_autorcvbuf);
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)

static struct mbuf *get_ddp_mbuf(int len);

#define MAX_DDP_BUFFER_SIZE	(M_TCB_RX_DDP_BUF0_LEN)
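
/*
 * Receive DDP overview (a summary of the code below, not a spec):
 *
 * t4_soreceive_ddp() replaces soreceive for offloaded connections.  Once an
 * indicate has arrived (SB_DDP_INDICATE is set on the socket buffer) it calls
 * handle_ddp(), which holds the pages backing the user's buffer (hold_uio()),
 * programs page pods for them (select_ddp_buffer() ->
 * t4_write_page_pods_for_db()), and points one of the connection's two DDP
 * buffers at the user buffer with a compound work request
 * (mk_update_tcb_for_ddp()).  The chip then DMAs payload directly into the
 * user pages and reports progress with CPL_RX_DATA_DDP / CPL_RX_DDP_COMPLETE,
 * handled by do_rx_data_ddp() / do_rx_ddp_complete() -> handle_ddp_data(),
 * which appends a placeholder mbuf to the socket buffer and wakes the
 * sleeper.  handle_ddp() waits until the buffer is no longer active and then
 * unwires the pages.
 */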

static struct ddp_buffer *
alloc_ddp_buffer(vm_page_t *pages, int npages, int offset, int len)
{
	struct ddp_buffer *db;

	db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT | M_ZERO);
	if (db == NULL) {
		CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
		return (NULL);
	}

	db->npages = npages;
	db->pages = pages;
	db->offset = offset;
	db->len = len;

	return (db);
}

static void
free_ddp_buffer(struct ddp_buffer *db)
{

	if (db == NULL)
		return;

	if (db->pages)
		free(db->pages, M_CXGBE);

	if (db->prsv.prsv_nppods > 0)
		t4_free_page_pods(&db->prsv);

	free(db, M_CXGBE);
}

void
release_ddp_resources(struct toepcb *toep)
{
	int i;

	for (i = 0; i < nitems(toep->db); i++) {
		if (toep->db[i] != NULL) {
			free_ddp_buffer(toep->db[i]);
			toep->db[i] = NULL;
		}
	}
}

/* XXX: handle_ddp_data code duplication */
void
insert_ddp_data(struct toepcb *toep, uint32_t n)
{
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct sockbuf *sb = &inp->inp_socket->so_rcv;
	struct mbuf *m;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	m = get_ddp_mbuf(n);
	tp->rcv_nxt += n;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
	tp->rcv_wnd -= n;
#endif

	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
}

/* SET_TCB_FIELD sent as a ULP command looks like this */
#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))

/* RX_DATA_ACK sent as a ULP command looks like this */
#define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))

static inline void *
mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
    uint64_t word, uint64_t mask, uint64_t val)
{
	struct ulptx_idata *ulpsc;
	struct cpl_set_tcb_field_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__SET_TCB_FIELD_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

static inline void *
mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
{
	struct ulptx_idata *ulpsc;
	struct cpl_rx_data_ack_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
	req->credit_dack = htobe32(F_RX_MODULATE_RX);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__RX_DATA_ACK_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

static inline uint64_t
select_ddp_flags(struct socket *so, int flags, int db_idx)
{
	uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
	int waitall = flags & MSG_WAITALL;
	int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	if (db_idx == 0) {
		ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
		if (waitall)
			ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
		else if (nb)
			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
		else
			ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
	} else {
		ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
		if (waitall)
			ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
		else if (nb)
			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
		else
			ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
	}

	return (ddp_flags);
}

static struct wrqe *
mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
    int offset, uint64_t ddp_flags)
{
	struct ddp_buffer *db = toep->db[db_idx];
	struct wrqe *wr;
	struct work_request_hdr *wrh;
	struct ulp_txpkt *ulpmc;
	int len;

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	/*
	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
	 *
	 * The work request header is 16B and always ends at a 16B boundary.
	 * The ULPTX master commands that follow must all end at 16B boundaries
	 * too so we round up the size to 16.
	 */
	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
	    roundup2(LEN__RX_DATA_ACK_ULP, 16);

	wr = alloc_wrqe(len, toep->ctrlq);
	if (wr == NULL)
		return (NULL);
	wrh = wrtod(wr);
	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
	ulpmc = (struct ulp_txpkt *)(wrh + 1);

	/* Write the buffer's tag */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    V_TCB_RX_DDP_BUF0_TAG(db->prsv.prsv_tag));

	/* Update the current offset in the DDP buffer and its total length */
	if (db_idx == 0)
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF0_LEN(db->len));
	else
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));

	/* Update DDP flags */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
	    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
	    V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);

	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);

	return (wr);
}

static void
discourage_ddp(struct toepcb *toep)
{

	if (toep->ddp_score && --toep->ddp_score == 0) {
		toep->ddp_flags &= ~DDP_OK;
		toep->ddp_disabled = time_uptime;
		CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
		    __func__, toep->tid, time_uptime);
	}
}

static int
handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
{
	uint32_t report = be32toh(ddp_report);
	unsigned int db_flag;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct mbuf *m;

	db_flag = report & F_DDP_BUF_IDX ?
	    DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;

	if (__predict_false(!(report & F_DDP_INV)))
		CXGBE_UNIMPLEMENTED("DDP buffer still valid");

	INP_WLOCK(inp);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {

		/*
		 * XXX: think a bit more.
		 * tcpcb probably gone, but socket should still be around
		 * because we always wait for DDP completion in soreceive no
		 * matter what.  Just wake it up and let it clean up.
		 */

		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
		SOCKBUF_LOCK(sb);
		goto wakeup;
	}

	tp = intotcpcb(inp);

	/*
	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
	 * sequence number of the next byte to receive.  The length of
	 * the data received for this message must be computed by
	 * comparing the new and old values of rcv_nxt.
	 *
	 * For RX_DATA_DDP, len might be non-zero, but it is only the
	 * length of the most recent DMA.  It does not include the
	 * total length of the data received since the previous update
	 * for this DDP buffer.  rcv_nxt is the sequence number of the
	 * first received byte from the most recent DMA.
	 */
	len += be32toh(rcv_nxt) - tp->rcv_nxt;
	tp->rcv_nxt += len;
	tp->t_rcvtime = ticks;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
	tp->rcv_wnd -= len;
#endif
	m = get_ddp_mbuf(len);

	SOCKBUF_LOCK(sb);
	if (report & F_DDP_BUF_COMPLETE)
		toep->ddp_score = DDP_HIGH_SCORE;
	else
		discourage_ddp(toep);

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(sb, newsize, so, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
		else
			toep->rx_credits += newsize - hiwat;
	}
	CURVNET_RESTORE();

	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
wakeup:
	KASSERT(toep->ddp_flags & db_flag,
	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
	    __func__, toep, toep->ddp_flags, report));
	toep->ddp_flags &= ~db_flag;
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);

	INP_WUNLOCK(inp);
	return (0);
}

void
handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, struct sockbuf *sb,
    __be32 rcv_nxt)
{
	struct mbuf *m;
	int len;

	SOCKBUF_LOCK_ASSERT(sb);
	INP_WLOCK_ASSERT(toep->inp);
	len = be32toh(rcv_nxt) - tp->rcv_nxt;

	/* Signal handle_ddp() to break out of its sleep loop. */
	toep->ddp_flags &= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE);
	if (len == 0)
		return;

	tp->rcv_nxt += len;
	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
#endif

	m = get_ddp_mbuf(len);

	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
	F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
	F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
	F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)

extern cpl_handler_t t4_cpl_handler[];

static int
do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	uint32_t vld;
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	vld = be32toh(cpl->ddpvld);
	if (__predict_false(vld & DDP_ERR)) {
		panic("%s: DDP error 0x%x (tid %d, toep %p)",
		    __func__, vld, tid, toep);
	}

	if (toep->ulp_mode == ULP_MODE_ISCSI) {
		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
		return (0);
	}

	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));

	return (0);
}

static int
do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);

	return (0);
}

void
enable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0,
	    toep->ofld_rxq->iq.abs_id);
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0, toep->ofld_rxq->iq.abs_id);
}

static inline void
disable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1), 0, 0,
	    toep->ofld_rxq->iq.abs_id);
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_OFF(1), V_TF_DDP_OFF(1), 0, 0, toep->ofld_rxq->iq.abs_id);
}

static int
hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
{
	struct vm_map *map;
	struct iovec *iov;
	vm_offset_t start, end;
	vm_page_t *pp;
	int n;

	KASSERT(uio->uio_iovcnt == 1,
	    ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
	KASSERT(uio->uio_td->td_proc == curproc,
	    ("%s: uio proc (%p) is not curproc (%p)",
	    __func__, uio->uio_td->td_proc, curproc));

	map = &curproc->p_vmspace->vm_map;
	iov = &uio->uio_iov[0];
	start = trunc_page((uintptr_t)iov->iov_base);
	end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
	n = howmany(end - start, PAGE_SIZE);

	if (end - start > MAX_DDP_BUFFER_SIZE)
		return (E2BIG);

	pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
	if (pp == NULL)
		return (ENOMEM);

	if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
	    iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
		free(pp, M_CXGBE);
		return (EFAULT);
	}

	*ppages = pp;
	*pnpages = n;

	return (0);
}

static int
bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
{
	int i;

	if (db == NULL || db->npages != npages || db->offset != offset ||
	    db->len != len)
		return (1);

	for (i = 0; i < npages; i++) {
		if (pages[i]->phys_addr != db->pages[i]->phys_addr)
			return (1);
	}

	return (0);
}

static int
calculate_hcf(int n1, int n2)
{
	int a, b, t;

	if (n1 <= n2) {
		a = n1;
		b = n2;
	} else {
		a = n2;
		b = n1;
	}

	while (a != 0) {
		t = a;
		a = b % a;
		b = t;
	}

	return (b);
}

static inline int
pages_to_nppods(int npages, int ddp_page_shift)
{

	MPASS(ddp_page_shift >= PAGE_SHIFT);

	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
}

static int
alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
    struct ppod_reservation *prsv)
{
	vmem_addr_t addr;	/* relative to start of region */

	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
	    &addr) != 0)
		return (ENOMEM);

	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);

	/*
	 * The hardware tagmask includes an extra invalid bit but the arena was
	 * seeded with valid values only.  An allocation out of this arena will
	 * fit inside the tagmask but won't have the invalid bit set.
	 */
	MPASS((addr & pr->pr_tag_mask) == addr);
	MPASS((addr & pr->pr_invalid_bit) == 0);

	prsv->prsv_pr = pr;
	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
	prsv->prsv_nppods = nppods;

	return (0);
}

int
t4_alloc_page_pods_for_db(struct ppod_region *pr, struct ddp_buffer *db)
{
	int i, hcf, seglen, idx, nppods;
	struct ppod_reservation *prsv = &db->prsv;

	KASSERT(prsv->prsv_nppods == 0,
	    ("%s: page pods already allocated", __func__));

	/*
	 * The DDP page size is unrelated to the VM page size.  We combine
	 * contiguous physical pages into larger segments to get the best DDP
	 * page size possible.  This is the largest of the four sizes in
	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
	 * the page list.
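	 *
	 * An illustrative example, assuming 4KB VM pages: if the buffer's
	 * pages coalesce into physically contiguous segments of 64KB, 16KB,
	 * and 32KB then the HCF is 16KB, and the largest of the four DDP page
	 * sizes that divides 16KB evenly is selected.  A single isolated 4KB
	 * page anywhere in the list drops the HCF to 4KB, which is why the
	 * loop below short circuits as soon as the HCF falls below the second
	 * smallest DDP page size.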
	 */
	hcf = 0;
	for (i = 0; i < db->npages; i++) {
		seglen = PAGE_SIZE;
		while (i < db->npages - 1 &&
		    db->pages[i]->phys_addr + PAGE_SIZE ==
		    db->pages[i + 1]->phys_addr) {
			seglen += PAGE_SIZE;
			i++;
		}

		hcf = calculate_hcf(hcf, seglen);
		if (hcf < (1 << pr->pr_page_shift[1])) {
			idx = 0;
			goto have_pgsz;	/* give up, short circuit */
		}
	}

#define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
		if ((hcf & PR_PAGE_MASK(idx)) == 0)
			break;
	}
#undef PR_PAGE_MASK

have_pgsz:
	MPASS(idx <= M_PPOD_PGSZ);

	nppods = pages_to_nppods(db->npages, pr->pr_page_shift[idx]);
	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
		return (0);
	MPASS(prsv->prsv_nppods > 0);

	return (1);
}

int
t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
    struct ppod_reservation *prsv)
{
	int hcf, seglen, idx, npages, nppods;
	uintptr_t start_pva, end_pva, pva, p1;

	MPASS(buf > 0);
	MPASS(len > 0);

	/*
	 * The DDP page size is unrelated to the VM page size.  We combine
	 * contiguous physical pages into larger segments to get the best DDP
	 * page size possible.  This is the largest of the four sizes in
	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
	 * in the page list.
	 */
	hcf = 0;
	start_pva = trunc_page(buf);
	end_pva = trunc_page(buf + len - 1);
	pva = start_pva;
	while (pva <= end_pva) {
		seglen = PAGE_SIZE;
		p1 = pmap_kextract(pva);
		pva += PAGE_SIZE;
		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
			seglen += PAGE_SIZE;
			pva += PAGE_SIZE;
		}

		hcf = calculate_hcf(hcf, seglen);
		if (hcf < (1 << pr->pr_page_shift[1])) {
			idx = 0;
			goto have_pgsz;	/* give up, short circuit */
		}
	}

#define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
		if ((hcf & PR_PAGE_MASK(idx)) == 0)
			break;
	}
#undef PR_PAGE_MASK

have_pgsz:
	MPASS(idx <= M_PPOD_PGSZ);

	npages = 1;
	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
	nppods = howmany(npages, PPOD_PAGES);
	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
		return (ENOMEM);
	MPASS(prsv->prsv_nppods > 0);

	return (0);
}

void
t4_free_page_pods(struct ppod_reservation *prsv)
{
	struct ppod_region *pr = prsv->prsv_pr;
	vmem_addr_t addr;

	MPASS(prsv != NULL);
	MPASS(prsv->prsv_nppods != 0);

	addr = prsv->prsv_tag & pr->pr_tag_mask;
	MPASS((addr & pr->pr_invalid_bit) == 0);

	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
	    pr->pr_arena, addr, prsv->prsv_nppods);

	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
	prsv->prsv_nppods = 0;
}

#define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)

int
t4_write_page_pods_for_db(struct adapter *sc, struct sge_wrq *wrq, int tid,
    struct ddp_buffer *db)
{
	struct wrqe *wr;
	struct ulp_mem_io *ulpmc;
	struct ulptx_idata *ulpsc;
	struct pagepod *ppod;
	int i, j, k, n, chunk, len, ddp_pgsz, idx;
	u_int ppod_addr;
	uint32_t cmd;
	struct ppod_reservation *prsv = &db->prsv;
	struct ppod_region *pr = prsv->prsv_pr;

	MPASS(prsv->prsv_nppods > 0);

	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
	if (is_t4(sc))
		cmd |= htobe32(F_ULP_MEMIO_ORDER);
	else
		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {

		/* How many page pods are we writing in this cycle */
		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
		chunk = PPOD_SZ(n);
		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);

		wr = alloc_wrqe(len, wrq);
		if (wr == NULL)
			return (ENOMEM);	/* ok to just bail out */
		ulpmc = wrtod(wr);

		INIT_ULPTX_WR(ulpmc, len, 0, 0);
		ulpmc->cmd = cmd;
		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));

		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
		ulpsc->len = htobe32(chunk);

		ppod = (struct pagepod *)(ulpsc + 1);
		for (j = 0; j < n; i++, j++, ppod++) {
			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
			    V_PPOD_TID(tid) | prsv->prsv_tag);
			ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
			    V_PPOD_OFST(db->offset));
			ppod->rsvd = 0;
			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
			for (k = 0; k < nitems(ppod->addr); k++) {
				if (idx < db->npages) {
					ppod->addr[k] =
					    htobe64(db->pages[idx]->phys_addr);
					idx += ddp_pgsz / PAGE_SIZE;
				} else
					ppod->addr[k] = 0;
#if 0
				CTR5(KTR_CXGBE,
				    "%s: tid %d ppod[%d]->addr[%d] = %p",
				    __func__, toep->tid, i, k,
				    htobe64(ppod->addr[k]));
#endif
			}

		}

		t4_wrq_tx(sc, wr);
	}

	return (0);
}

int
t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
    struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
{
	struct wrqe *wr;
	struct ulp_mem_io *ulpmc;
	struct ulptx_idata *ulpsc;
	struct pagepod *ppod;
	int i, j, k, n, chunk, len, ddp_pgsz;
	u_int ppod_addr, offset;
	uint32_t cmd;
	struct ppod_region *pr = prsv->prsv_pr;
	uintptr_t end_pva, pva, pa;

	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
	if (is_t4(sc))
		cmd |= htobe32(F_ULP_MEMIO_ORDER);
	else
		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
	offset = buf & PAGE_MASK;
	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
	pva = trunc_page(buf);
	end_pva = trunc_page(buf + buflen - 1);
	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {

		/* How many page pods are we writing in this cycle */
		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
		MPASS(n > 0);
		chunk = PPOD_SZ(n);
		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);

		wr = alloc_wrqe(len, wrq);
		if (wr == NULL)
			return (ENOMEM);	/* ok to just bail out */
		ulpmc = wrtod(wr);

		INIT_ULPTX_WR(ulpmc, len, 0, 0);
		ulpmc->cmd = cmd;
		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));

		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
		ulpsc->len = htobe32(chunk);

		ppod = (struct pagepod *)(ulpsc + 1);
		for (j = 0; j < n; i++, j++, ppod++) {
			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
			    V_PPOD_TID(tid) |
			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
			    V_PPOD_OFST(offset));
			ppod->rsvd = 0;

			for (k = 0; k < nitems(ppod->addr); k++) {
				if (pva > end_pva)
					ppod->addr[k] = 0;
				else {
					pa = pmap_kextract(pva);
					ppod->addr[k] = htobe64(pa);
					pva += ddp_pgsz;
				}
#if 0
				CTR5(KTR_CXGBE,
				    "%s: tid %d ppod[%d]->addr[%d] = %p",
				    __func__, tid, i, k,
				    htobe64(ppod->addr[k]));
#endif
			}

			/*
			 * Walk back 1 segment so that the first address in the
			 * next pod is the same as the last one in the current
			 * pod.
			 */
			pva -= ddp_pgsz;
		}

		t4_wrq_tx(sc, wr);
	}

	MPASS(pva <= end_pva);

	return (0);
}

/*
 * Reuse, or allocate (and program the page pods for) a new DDP buffer.  The
 * "pages" array is handed over to this function and should not be used in any
 * way by the caller after that.
 */
static int
select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
    int npages, int db_off, int db_len)
{
	struct ddp_buffer *db;
	struct tom_data *td = sc->tom_softc;
	int i, empty_slot = -1;

	/* Try to reuse */
	for (i = 0; i < nitems(toep->db); i++) {
		if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
			free(pages, M_CXGBE);
			return (i);	/* pages still held */
		} else if (toep->db[i] == NULL && empty_slot < 0)
			empty_slot = i;
	}

	/* Allocate new buffer, write its page pods. */
	db = alloc_ddp_buffer(pages, npages, db_off, db_len);
	if (db == NULL) {
		vm_page_unhold_pages(pages, npages);
		free(pages, M_CXGBE);
		return (-1);
	}
	if (t4_alloc_page_pods_for_db(&td->pr, db) == 0) {
		vm_page_unhold_pages(pages, npages);
		free_ddp_buffer(db);
		return (-1);
	}
	if (t4_write_page_pods_for_db(sc, toep->ctrlq, toep->tid, db) != 0) {
		vm_page_unhold_pages(pages, npages);
		free_ddp_buffer(db);
		return (-1);
	}

	i = empty_slot;
	if (i < 0) {
		i = arc4random() % nitems(toep->db);
		free_ddp_buffer(toep->db[i]);
	}
	toep->db[i] = db;

	CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
	    __func__, toep->tid, i, db, db->prsv.prsv_tag);

	return (i);
}

static void
wire_ddp_buffer(struct ddp_buffer *db)
{
	int i;
	vm_page_t p;

	for (i = 0; i < db->npages; i++) {
		p = db->pages[i];
		vm_page_lock(p);
		vm_page_wire(p);
		vm_page_unhold(p);
		vm_page_unlock(p);
	}
}

static void
unwire_ddp_buffer(struct ddp_buffer *db)
{
	int i;
	vm_page_t p;

	for (i = 0; i < db->npages; i++) {
		p = db->pages[i];
		vm_page_lock(p);
		vm_page_unwire(p, 0);
		vm_page_unlock(p);
	}
}

static int
handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
{
	struct sockbuf *sb = &so->so_rcv;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);
	vm_page_t *pages;
	int npages, db_idx, rc, buf_flag;
	struct ddp_buffer *db;
	struct wrqe *wr;
	uint64_t ddp_flags;

	SOCKBUF_LOCK_ASSERT(sb);

#if 0
	if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
		CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
		    __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
	}
#endif

	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
	if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
		goto no_ddp;

	/*
	 * Fault in and then hold the pages of the uio buffers.  We'll wire them
	 * a bit later if everything else works out.
	 */
	SOCKBUF_UNLOCK(sb);
	if (hold_uio(uio, &pages, &npages) != 0) {
		SOCKBUF_LOCK(sb);
		goto no_ddp;
	}
	SOCKBUF_LOCK(sb);
	if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
		vm_page_unhold_pages(pages, npages);
		free(pages, M_CXGBE);
		goto no_ddp;
	}

	/*
	 * Figure out which one of the two DDP buffers to use this time.
	 */
	db_idx = select_ddp_buffer(sc, toep, pages, npages,
	    (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
	pages = NULL;	/* handed off to select_ddp_buffer */
	if (db_idx < 0)
		goto no_ddp;
	db = toep->db[db_idx];
	buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;

	/*
	 * Build the compound work request that tells the chip where to DMA the
	 * payload.
	 */
	ddp_flags = select_ddp_flags(so, flags, db_idx);
	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
	if (wr == NULL) {
		/*
		 * Just unhold the pages.  The DDP buffer's software state is
		 * left as-is in the toep.  The page pods were written
		 * successfully and we may have an opportunity to use it in the
		 * future.
		 */
		vm_page_unhold_pages(db->pages, db->npages);
		goto no_ddp;
	}

	/* Wire (and then unhold) the pages, and give the chip the go-ahead. */
	wire_ddp_buffer(db);
	t4_wrq_tx(sc, wr);
	sb->sb_flags &= ~SB_DDP_INDICATE;
	toep->ddp_flags |= buf_flag;

	/*
	 * Wait for the DDP operation to complete and then unwire the pages.
	 * The return code from the sbwait will be the final return code of this
	 * function.  But we do need to wait for DDP no matter what.
	 */
	rc = sbwait(sb);
	while (toep->ddp_flags & buf_flag) {
		sb->sb_flags |= SB_WAIT;
		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK, "sbwait", 0);
	}
	unwire_ddp_buffer(db);
	return (rc);
no_ddp:
	disable_ddp(sc, toep);
	discourage_ddp(toep);
	sb->sb_flags &= ~SB_DDP_INDICATE;
	return (0);
}

int
t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
    const char *name)
{
	int i;

	MPASS(pr != NULL);
	MPASS(r->size > 0);

	pr->pr_start = r->start;
	pr->pr_len = r->size;
	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);

	/* The SGL -> page pod algorithm requires the sizes to be in order. */
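	/*
	 * For example (illustrative values): page sizes of 4KB, 64KB, 256KB,
	 * and 1MB give shifts of 12, 16, 18, and 20, which pass the check
	 * below; any combination that is not strictly increasing is rejected
	 * with ENXIO.
	 */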
	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
			return (ENXIO);
	}

	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
		return (ENXIO);
	pr->pr_alias_shift = fls(pr->pr_tag_mask);
	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);

	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
	    M_FIRSTFIT | M_NOWAIT);
	if (pr->pr_arena == NULL)
		return (ENOMEM);

	return (0);
}

void
t4_free_ppod_region(struct ppod_region *pr)
{

	MPASS(pr != NULL);

	if (pr->pr_arena)
		vmem_destroy(pr->pr_arena);
	bzero(pr, sizeof(*pr));
}

#define VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{

	CXGBE_UNIMPLEMENTED(__func__);
}

static char ddp_magic_str[] = "nothing to see here";

static struct mbuf *
get_ddp_mbuf(int len)
{
	struct mbuf *m;

	m = m_get(M_NOWAIT, MT_DATA);
	if (m == NULL)
		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
	m->m_len = len;
	m->m_data = &ddp_magic_str[0];

	return (m);
}

static inline int
is_ddp_mbuf(struct mbuf *m)
{

	return (m->m_data == &ddp_magic_str[0]);
}

/*
 * Copy an mbuf chain into a uio limited by len if set.
 */
static int
m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
{
	int error, length, total;
	int progress = 0;

	if (len > 0)
		total = min(uio->uio_resid, len);
	else
		total = uio->uio_resid;

	/* Fill the uio with data from the mbufs. */
	for (; m != NULL; m = m->m_next) {
		length = min(m->m_len, total - progress);

		if (is_ddp_mbuf(m)) {
			enum uio_seg segflag = uio->uio_segflg;

			uio->uio_segflg = UIO_NOCOPY;
			error = uiomove(mtod(m, void *), length, uio);
			uio->uio_segflg = segflag;
		} else
			error = uiomove(mtod(m, void *), length, uio);
		if (error)
			return (error);

		progress += length;
	}

	return (0);
}

/*
 * Based on soreceive_stream() in uipc_socket.c
 */
int
t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid, ddp_handled = 0;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	SOCKBUF_LOCK(sb);
	if (error)
		goto out;

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {

		/* uio should be just as it was at entry */
		KASSERT(oresid == uio->uio_resid,
		    ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
		    __func__, oresid, uio->uio_resid, sb->sb_cc));

		error = handle_ddp(so, uio, flags, 0);
		ddp_handled = 1;
		if (error)
			goto out;
	}

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sb->sb_cc > 0)
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sb->sb_cc > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sb->sb_cc == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sb->sb_cc >= sb->sb_lowat ||
	     sb->sb_cc >= uio->uio_resid ||
	     sb->sb_cc >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error) {
		if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
			(void) handle_ddp(so, uio, flags, 1);
			ddp_handled = 1;
		}
		goto out;
	}
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
		goto restart;

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sb->sb_cc);
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

int
t4_ddp_mod_load(void)
{

	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	return (0);
}

void
t4_ddp_mod_unload(void)
{

	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
}
#endif