/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/cxgbe/tom/t4_ddp.c 312337 2017-01-17 07:43:37Z np $");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"

VNET_DECLARE(int, tcp_do_autorcvbuf);
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)

static struct mbuf *get_ddp_mbuf(int len);

#define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
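/*
 * Note: a DDP buffer's length is programmed into the TCB's RX_DDP_BUF*_LEN
 * field, so that field's mask (above) is also the largest buffer the
 * hardware can be pointed at.
 */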

static struct ddp_buffer *
alloc_ddp_buffer(vm_page_t *pages, int npages, int offset, int len)
{
	struct ddp_buffer *db;

	db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT | M_ZERO);
	if (db == NULL) {
		CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
		return (NULL);
	}

	db->npages = npages;
	db->pages = pages;
	db->offset = offset;
	db->len = len;

	return (db);
}

static void
free_ddp_buffer(struct ddp_buffer *db)
{

	if (db == NULL)
		return;

	if (db->pages)
		free(db->pages, M_CXGBE);

	if (db->prsv.prsv_nppods > 0)
		t4_free_page_pods(&db->prsv);

	free(db, M_CXGBE);
}

void
release_ddp_resources(struct toepcb *toep)
{
	int i;

	for (i = 0; i < nitems(toep->db); i++) {
		if (toep->db[i] != NULL) {
			free_ddp_buffer(toep->db[i]);
			toep->db[i] = NULL;
		}
	}
}

/* XXX: handle_ddp_data code duplication */
void
insert_ddp_data(struct toepcb *toep, uint32_t n)
{
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct sockbuf *sb = &inp->inp_socket->so_rcv;
	struct mbuf *m;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	m = get_ddp_mbuf(n);
	tp->rcv_nxt += n;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
	tp->rcv_wnd -= n;
#endif

	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
}

/* SET_TCB_FIELD sent as a ULP command looks like this */
#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))

/* RX_DATA_ACK sent as a ULP command looks like this */
#define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
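/*
 * Each ULPTX master command must end on a 16B boundary.  Neither template
 * above is guaranteed to be a multiple of 16, so the mk_*_ulp() helpers
 * below append a ULP_TX_SC_NOOP sub-command as padding whenever the natural
 * length isn't.
 */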

static inline void *
mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
    uint64_t word, uint64_t mask, uint64_t val)
{
	struct ulptx_idata *ulpsc;
	struct cpl_set_tcb_field_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__SET_TCB_FIELD_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

static inline void *
mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
{
	struct ulptx_idata *ulpsc;
	struct cpl_rx_data_ack_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
	req->credit_dack = htobe32(F_RX_MODULATE_RX);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__RX_DATA_ACK_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

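/*
 * Select the TCB DDP flags for the buffer about to be programmed.  With
 * MSG_WAITALL the PUSH_DISABLE bit is set so a PSH doesn't complete the
 * buffer early; for non-blocking reads the FLUSH bit is set instead so
 * whatever has been placed is delivered promptly.
 */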
static inline uint64_t
select_ddp_flags(struct socket *so, int flags, int db_idx)
{
	uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
	int waitall = flags & MSG_WAITALL;
	int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	if (db_idx == 0) {
		ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
		if (waitall)
			ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
		else if (nb)
			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
		else
			ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
	} else {
		ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
		if (waitall)
			ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
		else if (nb)
			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
		else
			ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
	}

	return (ddp_flags);
}

static struct wrqe *
mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
    int offset, uint64_t ddp_flags)
{
	struct ddp_buffer *db = toep->db[db_idx];
	struct wrqe *wr;
	struct work_request_hdr *wrh;
	struct ulp_txpkt *ulpmc;
	int len;

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	/*
	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
	 *
	 * The work request header is 16B and always ends at a 16B boundary.
	 * The ULPTX master commands that follow must all end at 16B boundaries
	 * too so we round up the size to 16.
	 */
	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
	    roundup2(LEN__RX_DATA_ACK_ULP, 16);

	wr = alloc_wrqe(len, toep->ctrlq);
	if (wr == NULL)
		return (NULL);
	wrh = wrtod(wr);
	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
	ulpmc = (struct ulp_txpkt *)(wrh + 1);

	/* Write the buffer's tag */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    V_TCB_RX_DDP_BUF0_TAG(db->prsv.prsv_tag));

	/* Update the current offset in the DDP buffer and its total length */
	if (db_idx == 0)
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF0_LEN(db->len));
	else
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));

	/* Update DDP flags */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
	    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
	    V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);

	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);

	return (wr);
}

static void
discourage_ddp(struct toepcb *toep)
{

	if (toep->ddp_score && --toep->ddp_score == 0) {
		toep->ddp_flags &= ~DDP_OK;
		toep->ddp_disabled = time_uptime;
		CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
		    __func__, toep->tid, time_uptime);
	}
}

static int
handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
{
	uint32_t report = be32toh(ddp_report);
	unsigned int db_flag;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct mbuf *m;

	db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;

	if (__predict_false(!(report & F_DDP_INV)))
		CXGBE_UNIMPLEMENTED("DDP buffer still valid");

	INP_WLOCK(inp);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {

		/*
		 * XXX: think a bit more.
		 * tcpcb probably gone, but socket should still be around
		 * because we always wait for DDP completion in soreceive no
		 * matter what.  Just wake it up and let it clean up.
		 */

		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
		SOCKBUF_LOCK(sb);
		goto wakeup;
	}

	tp = intotcpcb(inp);

	/*
	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
	 * sequence number of the next byte to receive.  The length of
	 * the data received for this message must be computed by
	 * comparing the new and old values of rcv_nxt.
	 *
	 * For RX_DATA_DDP, len might be non-zero, but it is only the
	 * length of the most recent DMA.  It does not include the
	 * total length of the data received since the previous update
	 * for this DDP buffer.  rcv_nxt is the sequence number of the
	 * first received byte from the most recent DMA.
	 */
	len += be32toh(rcv_nxt) - tp->rcv_nxt;
	tp->rcv_nxt += len;
	tp->t_rcvtime = ticks;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
	tp->rcv_wnd -= len;
#endif
	m = get_ddp_mbuf(len);

	SOCKBUF_LOCK(sb);
	if (report & F_DDP_BUF_COMPLETE)
		toep->ddp_score = DDP_HIGH_SCORE;
	else
		discourage_ddp(toep);

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(sb, newsize, so, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
		else
			toep->rx_credits += newsize - hiwat;
	}
	CURVNET_RESTORE();

	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
wakeup:
	KASSERT(toep->ddp_flags & db_flag,
	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
	    __func__, toep, toep->ddp_flags, report));
	toep->ddp_flags &= ~db_flag;
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);

	INP_WUNLOCK(inp);
	return (0);
}

void
handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, struct sockbuf *sb,
    __be32 rcv_nxt)
{
	struct mbuf *m;
	int len;

	SOCKBUF_LOCK_ASSERT(sb);
	INP_WLOCK_ASSERT(toep->inp);
	len = be32toh(rcv_nxt) - tp->rcv_nxt;

	/* Signal handle_ddp() to break out of its sleep loop. */
	toep->ddp_flags &= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE);
	if (len == 0)
		return;

	tp->rcv_nxt += len;
	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
#endif

	m = get_ddp_mbuf(len);

	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
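/*
 * Any of these report bits indicates a fatal DDP error; do_rx_data_ddp()
 * below panics if one is set.
 */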

extern cpl_handler_t t4_cpl_handler[];

static int
do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	uint32_t vld;
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	vld = be32toh(cpl->ddpvld);
	if (__predict_false(vld & DDP_ERR)) {
		panic("%s: DDP error 0x%x (tid %d, toep %p)",
		    __func__, vld, tid, toep);
	}

	if (toep->ulp_mode == ULP_MODE_ISCSI) {
		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
		return (0);
	}

	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));

	return (0);
}

static int
do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);

	return (0);
}

void
enable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0,
	    toep->ofld_rxq->iq.abs_id);
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0, toep->ofld_rxq->iq.abs_id);
}

static inline void
disable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1), 0, 0,
	    toep->ofld_rxq->iq.abs_id);
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_OFF(1), V_TF_DDP_OFF(1), 0, 0, toep->ofld_rxq->iq.abs_id);
}

static int
hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
{
	struct vm_map *map;
	struct iovec *iov;
	vm_offset_t start, end;
	vm_page_t *pp;
	int n;

	KASSERT(uio->uio_iovcnt == 1,
	    ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
	KASSERT(uio->uio_td->td_proc == curproc,
	    ("%s: uio proc (%p) is not curproc (%p)",
	    __func__, uio->uio_td->td_proc, curproc));

	map = &curproc->p_vmspace->vm_map;
	iov = &uio->uio_iov[0];
	start = trunc_page((uintptr_t)iov->iov_base);
	end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
	n = howmany(end - start, PAGE_SIZE);

	if (end - start > MAX_DDP_BUFFER_SIZE)
		return (E2BIG);

	pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
	if (pp == NULL)
		return (ENOMEM);

	if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
	    iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
		free(pp, M_CXGBE);
		return (EFAULT);
	}

	*ppages = pp;
	*pnpages = n;

	return (0);
}

static int
bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
{
	int i;

	if (db == NULL || db->npages != npages || db->offset != offset ||
	    db->len != len)
		return (1);

	for (i = 0; i < npages; i++) {
		if (pages[i]->phys_addr != db->pages[i]->phys_addr)
			return (1);
	}

	return (0);
}

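/*
 * Highest common factor, plain Euclid.  Worked example (assuming 4K VM
 * pages): a buffer made of a 16K and a 64K physically contiguous run gives
 * calculate_hcf(0, 16384) = 16384 and then calculate_hcf(16384, 65536) =
 * 16384, so 16K is the largest DDP page size that divides every segment.
 */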
static int
calculate_hcf(int n1, int n2)
{
	int a, b, t;

	if (n1 <= n2) {
		a = n1;
		b = n2;
	} else {
		a = n2;
		b = n1;
	}

	while (a != 0) {
		t = a;
		a = b % a;
		b = t;
	}

	return (b);
}

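/*
 * Number of page pods needed to map npages VM pages coalesced into DDP
 * pages of size 1 << ddp_page_shift.  Illustrative numbers: 32 4K VM pages
 * at a 16K DDP page size are 32 >> 2 = 8 DDP pages, which take
 * howmany(8, PPOD_PAGES) pods.
 */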
static inline int
pages_to_nppods(int npages, int ddp_page_shift)
{

	MPASS(ddp_page_shift >= PAGE_SHIFT);

	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
}

static int
alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
    struct ppod_reservation *prsv)
{
	vmem_addr_t addr;       /* relative to start of region */

	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
	    &addr) != 0)
		return (ENOMEM);

	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);

	/*
	 * The hardware tagmask includes an extra invalid bit but the arena was
	 * seeded with valid values only.  An allocation out of this arena will
	 * fit inside the tagmask but won't have the invalid bit set.
	 */
	MPASS((addr & pr->pr_tag_mask) == addr);
	MPASS((addr & pr->pr_invalid_bit) == 0);

	prsv->prsv_pr = pr;
	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
	prsv->prsv_nppods = nppods;

	return (0);
}

int
t4_alloc_page_pods_for_db(struct ppod_region *pr, struct ddp_buffer *db)
{
	int i, hcf, seglen, idx, nppods;
	struct ppod_reservation *prsv = &db->prsv;

	KASSERT(prsv->prsv_nppods == 0,
	    ("%s: page pods already allocated", __func__));

	/*
	 * The DDP page size is unrelated to the VM page size.  We combine
	 * contiguous physical pages into larger segments to get the best DDP
	 * page size possible.  This is the largest of the four sizes in
	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
	 * the page list.
	 */
	hcf = 0;
	for (i = 0; i < db->npages; i++) {
		seglen = PAGE_SIZE;
		while (i < db->npages - 1 &&
		    db->pages[i]->phys_addr + PAGE_SIZE ==
		    db->pages[i + 1]->phys_addr) {
			seglen += PAGE_SIZE;
			i++;
		}

		hcf = calculate_hcf(hcf, seglen);
		if (hcf < (1 << pr->pr_page_shift[1])) {
			idx = 0;
			goto have_pgsz;	/* give up, short circuit */
		}
	}

#define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
		if ((hcf & PR_PAGE_MASK(idx)) == 0)
			break;
	}
#undef PR_PAGE_MASK

have_pgsz:
	MPASS(idx <= M_PPOD_PGSZ);

	nppods = pages_to_nppods(db->npages, pr->pr_page_shift[idx]);
	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
		return (0);
	MPASS(prsv->prsv_nppods > 0);

	return (1);
}

int
t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
    struct ppod_reservation *prsv)
{
	int hcf, seglen, idx, npages, nppods;
	uintptr_t start_pva, end_pva, pva, p1;

	MPASS(buf > 0);
	MPASS(len > 0);

	/*
	 * The DDP page size is unrelated to the VM page size.  We combine
	 * contiguous physical pages into larger segments to get the best DDP
	 * page size possible.  This is the largest of the four sizes in
	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
	 * in the page list.
	 */
	hcf = 0;
	start_pva = trunc_page(buf);
	end_pva = trunc_page(buf + len - 1);
	pva = start_pva;
	while (pva <= end_pva) {
		seglen = PAGE_SIZE;
		p1 = pmap_kextract(pva);
		pva += PAGE_SIZE;
		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
			seglen += PAGE_SIZE;
			pva += PAGE_SIZE;
		}

		hcf = calculate_hcf(hcf, seglen);
		if (hcf < (1 << pr->pr_page_shift[1])) {
			idx = 0;
			goto have_pgsz;	/* give up, short circuit */
		}
	}

#define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
		if ((hcf & PR_PAGE_MASK(idx)) == 0)
			break;
	}
#undef PR_PAGE_MASK

have_pgsz:
	MPASS(idx <= M_PPOD_PGSZ);

	npages = 1;
	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
	nppods = howmany(npages, PPOD_PAGES);
	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
		return (ENOMEM);
	MPASS(prsv->prsv_nppods > 0);

	return (0);
}

void
t4_free_page_pods(struct ppod_reservation *prsv)
{
	struct ppod_region *pr = prsv->prsv_pr;
	vmem_addr_t addr;

	MPASS(prsv != NULL);
	MPASS(prsv->prsv_nppods != 0);

	addr = prsv->prsv_tag & pr->pr_tag_mask;
	MPASS((addr & pr->pr_invalid_bit) == 0);

	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
	    pr->pr_arena, addr, prsv->prsv_nppods);

	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
	prsv->prsv_nppods = 0;
}

#define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
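/*
 * A ULP_TX_SC_IMM sub-command carries at most 256B of immediate data here,
 * so each cycle of the loops below can write 256 / PPOD_SIZE page pods.
 */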

int
t4_write_page_pods_for_db(struct adapter *sc, struct sge_wrq *wrq, int tid,
    struct ddp_buffer *db)
{
	struct wrqe *wr;
	struct ulp_mem_io *ulpmc;
	struct ulptx_idata *ulpsc;
	struct pagepod *ppod;
	int i, j, k, n, chunk, len, ddp_pgsz, idx;
	u_int ppod_addr;
	uint32_t cmd;
	struct ppod_reservation *prsv = &db->prsv;
	struct ppod_region *pr = prsv->prsv_pr;

	MPASS(prsv->prsv_nppods > 0);

	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
	if (is_t4(sc))
		cmd |= htobe32(F_ULP_MEMIO_ORDER);
	else
		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {

		/* How many page pods are we writing in this cycle */
		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
		chunk = PPOD_SZ(n);
		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);

		wr = alloc_wrqe(len, wrq);
		if (wr == NULL)
			return (ENOMEM);	/* ok to just bail out */
		ulpmc = wrtod(wr);

		INIT_ULPTX_WR(ulpmc, len, 0, 0);
		ulpmc->cmd = cmd;
		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));

		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
		ulpsc->len = htobe32(chunk);

		ppod = (struct pagepod *)(ulpsc + 1);
		for (j = 0; j < n; i++, j++, ppod++) {
			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
			    V_PPOD_TID(tid) | prsv->prsv_tag);
			ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
			    V_PPOD_OFST(db->offset));
			ppod->rsvd = 0;
			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
			for (k = 0; k < nitems(ppod->addr); k++) {
				if (idx < db->npages) {
					ppod->addr[k] =
					    htobe64(db->pages[idx]->phys_addr);
					idx += ddp_pgsz / PAGE_SIZE;
				} else
					ppod->addr[k] = 0;
#if 0
				CTR5(KTR_CXGBE,
				    "%s: tid %d ppod[%d]->addr[%d] = %p",
				    __func__, toep->tid, i, k,
				    htobe64(ppod->addr[k]));
#endif
			}

		}

		t4_wrq_tx(sc, wr);
	}

	return (0);
}

int
t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid,
    struct ppod_reservation *prsv, vm_offset_t buf, int buflen)
{
	struct wrqe *wr;
	struct ulp_mem_io *ulpmc;
	struct ulptx_idata *ulpsc;
	struct pagepod *ppod;
	int i, j, k, n, chunk, len, ddp_pgsz;
	u_int ppod_addr, offset;
	uint32_t cmd;
	struct ppod_region *pr = prsv->prsv_pr;
	uintptr_t end_pva, pva, pa;

	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
	if (is_t4(sc))
		cmd |= htobe32(F_ULP_MEMIO_ORDER);
	else
		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
	offset = buf & PAGE_MASK;
	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
	pva = trunc_page(buf);
	end_pva = trunc_page(buf + buflen - 1);
	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {

		/* How many page pods are we writing in this cycle */
		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
		MPASS(n > 0);
		chunk = PPOD_SZ(n);
		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);

		wr = alloc_wrqe(len, wrq);
		if (wr == NULL)
			return (ENOMEM);	/* ok to just bail out */
		ulpmc = wrtod(wr);

		INIT_ULPTX_WR(ulpmc, len, 0, 0);
		ulpmc->cmd = cmd;
		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));

		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
		ulpsc->len = htobe32(chunk);

		ppod = (struct pagepod *)(ulpsc + 1);
		for (j = 0; j < n; i++, j++, ppod++) {
			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
			    V_PPOD_TID(tid) |
			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
			    V_PPOD_OFST(offset));
			ppod->rsvd = 0;

			for (k = 0; k < nitems(ppod->addr); k++) {
				if (pva > end_pva)
					ppod->addr[k] = 0;
				else {
					pa = pmap_kextract(pva);
					ppod->addr[k] = htobe64(pa);
					pva += ddp_pgsz;
				}
#if 0
				CTR5(KTR_CXGBE,
				    "%s: tid %d ppod[%d]->addr[%d] = %p",
				    __func__, tid, i, k,
				    htobe64(ppod->addr[k]));
#endif
			}

			/*
			 * Walk back 1 segment so that the first address in the
			 * next pod is the same as the last one in the current
			 * pod.
			 */
			pva -= ddp_pgsz;
		}

		t4_wrq_tx(sc, wr);
	}

	MPASS(pva <= end_pva);

	return (0);
}

/*
 * Reuse, or allocate (and program the page pods for) a new DDP buffer.  The
 * "pages" array is handed over to this function and should not be used in any
 * way by the caller after that.
 */
static int
select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
    int npages, int db_off, int db_len)
{
	struct ddp_buffer *db;
	struct tom_data *td = sc->tom_softc;
	int i, empty_slot = -1;

	/* Try to reuse */
	for (i = 0; i < nitems(toep->db); i++) {
		if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
			free(pages, M_CXGBE);
			return (i);	/* pages still held */
		} else if (toep->db[i] == NULL && empty_slot < 0)
			empty_slot = i;
	}

	/* Allocate new buffer, write its page pods. */
	db = alloc_ddp_buffer(pages, npages, db_off, db_len);
	if (db == NULL) {
		vm_page_unhold_pages(pages, npages);
		free(pages, M_CXGBE);
		return (-1);
	}
	if (!t4_alloc_page_pods_for_db(&td->pr, db)) {
		vm_page_unhold_pages(pages, npages);
		free_ddp_buffer(db);
		return (-1);
	}
	if (t4_write_page_pods_for_db(sc, toep->ctrlq, toep->tid, db) != 0) {
		vm_page_unhold_pages(pages, npages);
		free_ddp_buffer(db);
		return (-1);
	}

	i = empty_slot;
	if (i < 0) {
		i = arc4random() % nitems(toep->db);
		free_ddp_buffer(toep->db[i]);
	}
	toep->db[i] = db;

	CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
	    __func__, toep->tid, i, db, db->prsv.prsv_tag);

	return (i);
}

static void
wire_ddp_buffer(struct ddp_buffer *db)
{
	int i;
	vm_page_t p;

	for (i = 0; i < db->npages; i++) {
		p = db->pages[i];
		vm_page_lock(p);
		vm_page_wire(p);
		vm_page_unhold(p);
		vm_page_unlock(p);
	}
}

static void
unwire_ddp_buffer(struct ddp_buffer *db)
{
	int i;
	vm_page_t p;

	for (i = 0; i < db->npages; i++) {
		p = db->pages[i];
		vm_page_lock(p);
		vm_page_unwire(p, 0);
		vm_page_unlock(p);
	}
}

static int
handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
{
	struct sockbuf *sb = &so->so_rcv;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);
	vm_page_t *pages;
	int npages, db_idx, rc, buf_flag;
	struct ddp_buffer *db;
	struct wrqe *wr;
	uint64_t ddp_flags;

	SOCKBUF_LOCK_ASSERT(sb);

#if 0
	if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
		CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
		    __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
	}
#endif

	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
	if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
		goto no_ddp;

	/*
	 * Fault in and then hold the pages of the uio buffers.  We'll wire them
	 * a bit later if everything else works out.
	 */
	SOCKBUF_UNLOCK(sb);
	if (hold_uio(uio, &pages, &npages) != 0) {
		SOCKBUF_LOCK(sb);
		goto no_ddp;
	}
	SOCKBUF_LOCK(sb);
	if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
		vm_page_unhold_pages(pages, npages);
		free(pages, M_CXGBE);
		goto no_ddp;
	}

	/*
	 * Figure out which one of the two DDP buffers to use this time.
	 */
	db_idx = select_ddp_buffer(sc, toep, pages, npages,
	    (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
	pages = NULL;	/* handed off to select_ddp_buffer */
	if (db_idx < 0)
		goto no_ddp;
	db = toep->db[db_idx];
	buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;

	/*
	 * Build the compound work request that tells the chip where to DMA the
	 * payload.
	 */
	ddp_flags = select_ddp_flags(so, flags, db_idx);
	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
	if (wr == NULL) {
		/*
		 * Just unhold the pages.  The DDP buffer's software state is
		 * left as-is in the toep.  The page pods were written
		 * successfully and we may have an opportunity to use it in the
		 * future.
		 */
		vm_page_unhold_pages(db->pages, db->npages);
		goto no_ddp;
	}

	/* Wire (and then unhold) the pages, and give the chip the go-ahead. */
	wire_ddp_buffer(db);
	t4_wrq_tx(sc, wr);
	sb->sb_flags &= ~SB_DDP_INDICATE;
	toep->ddp_flags |= buf_flag;

	/*
	 * Wait for the DDP operation to complete and then unwire the pages.
	 * The return code from the sbwait will be the final return code of this
	 * function.  But we do need to wait for DDP no matter what.
	 */
	rc = sbwait(sb);
	while (toep->ddp_flags & buf_flag) {
		sb->sb_flags |= SB_WAIT;
		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK, "sbwait", 0);
	}
	unwire_ddp_buffer(db);
	return (rc);
no_ddp:
	disable_ddp(sc, toep);
	discourage_ddp(toep);
	sb->sb_flags &= ~SB_DDP_INDICATE;
	return (0);
}

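/*
 * One-time setup of a page pod region.  A DDP tag is V_PPOD_PGSZ(page size
 * index) in its top bits plus the pod offset within the region in its low
 * bits; pr_tag_mask covers those offset bits.  The bits in between
 * (pr_alias_mask) presumably let the driver vary the tag across reuses of
 * the same pods, with pr_invalid_bit reserved as a value the arena can
 * never hand out.
 */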
int
t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
    const char *name)
{
	int i;

	MPASS(pr != NULL);
	MPASS(r->size > 0);

	pr->pr_start = r->start;
	pr->pr_len = r->size;
	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);

	/* The SGL -> page pod algorithm requires the sizes to be in order. */
	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
			return (ENXIO);
	}

	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
		return (ENXIO);
	pr->pr_alias_shift = fls(pr->pr_tag_mask);
	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);

	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
	    M_FIRSTFIT | M_NOWAIT);
	if (pr->pr_arena == NULL)
		return (ENOMEM);

	return (0);
}

void
t4_free_ppod_region(struct ppod_region *pr)
{

	MPASS(pr != NULL);

	if (pr->pr_arena)
		vmem_destroy(pr->pr_arena);
	bzero(pr, sizeof(*pr));
}

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{

	CXGBE_UNIMPLEMENTED(__func__);
}

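/*
 * DDP places data directly in the user buffer, but soreceive still needs
 * something in the socket buffer to account for those bytes.  That is done
 * with placeholder mbufs: m_data points at ddp_magic_str so they can be
 * recognized later, and m_mbuftouio_ddp() consumes them with UIO_NOCOPY,
 * which advances the uio without copying anything.
 */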
static char ddp_magic_str[] = "nothing to see here";

static struct mbuf *
get_ddp_mbuf(int len)
{
	struct mbuf *m;

	m = m_get(M_NOWAIT, MT_DATA);
	if (m == NULL)
		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
	m->m_len = len;
	m->m_data = &ddp_magic_str[0];

	return (m);
}

static inline int
is_ddp_mbuf(struct mbuf *m)
{

	return (m->m_data == &ddp_magic_str[0]);
}

/*
 * Copy an mbuf chain into a uio limited by len if set.
 */
static int
m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
{
	int error, length, total;
	int progress = 0;

	if (len > 0)
		total = min(uio->uio_resid, len);
	else
		total = uio->uio_resid;

	/* Fill the uio with data from the mbufs. */
	for (; m != NULL; m = m->m_next) {
		length = min(m->m_len, total - progress);

		if (is_ddp_mbuf(m)) {
			enum uio_seg segflag = uio->uio_segflg;

			uio->uio_segflg	= UIO_NOCOPY;
			error = uiomove(mtod(m, void *), length, uio);
			uio->uio_segflg	= segflag;
		} else
			error = uiomove(mtod(m, void *), length, uio);
		if (error)
			return (error);

		progress += length;
	}

	return (0);
}

/*
 * Based on soreceive_stream() in uipc_socket.c
 */
int
t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid, ddp_handled = 0;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	SOCKBUF_LOCK(sb);
	if (error)
		goto out;

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {

		/* uio should be just as it was at entry */
		KASSERT(oresid == uio->uio_resid,
		    ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
		    __func__, oresid, uio->uio_resid, sb->sb_cc));

		error = handle_ddp(so, uio, flags, 0);
		ddp_handled = 1;
		if (error)
			goto out;
	}

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sb->sb_cc > 0)
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sb->sb_cc > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sb->sb_cc == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sb->sb_cc >= sb->sb_lowat ||
	     sb->sb_cc >= uio->uio_resid ||
	     sb->sb_cc >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error) {
		if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
			(void) handle_ddp(so, uio, flags, 1);
			ddp_handled = 1;
		}
		goto out;
	}
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
		goto restart;

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sb->sb_cc);
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

int
t4_ddp_mod_load(void)
{

	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	return (0);
}

void
t4_ddp_mod_unload(void)
{

	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
}
#endif