/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"

#define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
#define PPOD_SIZE	(PPOD_SZ(1))

/* XXX: must match A_ULP_RX_TDDP_PSZ */
static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6};

#if 0
static void
t4_dump_tcb(struct adapter *sc, int tid)
{
	uint32_t tcb_base, off, i, j;

	/* Dump TCB for the tid */
	tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
	t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2),
	    tcb_base + tid * TCB_SIZE);
	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2));
	off = 0;
	printf("\n");
	for (i = 0; i < 4; i++) {
		uint32_t buf[8];
		for (j = 0; j < 8; j++, off += 4)
			buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off));

		printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
		    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
		    buf[7]);
	}
}
#endif

#define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
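/*
 * Reserve a contiguous run of n page pods from the pool.  Returns the index
 * of the first pod, or -1 if the request cannot be satisfied.  The
 * reservation is recorded by linking 'pr' into the ordered list of regions.
 */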
static int
alloc_ppods(struct tom_data *td, int n, struct ppod_region *pr)
{
	int ppod;

	KASSERT(n > 0, ("%s: nonsense allocation (%d)", __func__, n));

	mtx_lock(&td->ppod_lock);
	if (n > td->nppods_free) {
		mtx_unlock(&td->ppod_lock);
		return (-1);
	}

	if (td->nppods_free_head >= n) {
		td->nppods_free_head -= n;
		ppod = td->nppods_free_head;
		TAILQ_INSERT_HEAD(&td->ppods, pr, link);
	} else {
		struct ppod_region *p;

		ppod = td->nppods_free_head;
		TAILQ_FOREACH(p, &td->ppods, link) {
			ppod += p->used + p->free;
			if (n <= p->free) {
				ppod -= n;
				p->free -= n;
				TAILQ_INSERT_AFTER(&td->ppods, p, pr, link);
				goto allocated;
			}
		}

		if (__predict_false(ppod != td->nppods)) {
			panic("%s: ppods TAILQ (%p) corrupt."
			    "  At %d instead of %d at the end of the queue.",
			    __func__, &td->ppods, ppod, td->nppods);
		}

		mtx_unlock(&td->ppod_lock);
		return (-1);
	}

allocated:
	pr->used = n;
	pr->free = 0;
	td->nppods_free -= n;
	mtx_unlock(&td->ppod_lock);

	return (ppod);
}

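/*
 * Return a page pod region to the pool.  The pods are merged into the free
 * space tracked by the previous region in the list (or into the free head if
 * this was the first region).
 */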
static void
free_ppods(struct tom_data *td, struct ppod_region *pr)
{
	struct ppod_region *p;

	KASSERT(pr->used > 0, ("%s: nonsense free (%d)", __func__, pr->used));

	mtx_lock(&td->ppod_lock);
	p = TAILQ_PREV(pr, ppod_head, link);
	if (p != NULL)
		p->free += pr->used + pr->free;
	else
		td->nppods_free_head += pr->used + pr->free;
	td->nppods_free += pr->used;
	KASSERT(td->nppods_free <= td->nppods,
	    ("%s: nppods_free (%d) > nppods (%d).  %d freed this time.",
	    __func__, td->nppods_free, td->nppods, pr->used));
	TAILQ_REMOVE(&td->ppods, pr, link);
	mtx_unlock(&td->ppod_lock);
}

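/*
 * Number of page pods needed to cover npages VM pages at the given DDP page
 * size.
 */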
static inline int
pages_to_nppods(int npages, int ddp_pgsz)
{
	int nsegs = npages * PAGE_SIZE / ddp_pgsz;

	return (howmany(nsegs, PPOD_PAGES));
}

static void
free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
{

	if (db == NULL)
		return;

	if (db->pages)
		free(db->pages, M_CXGBE);

	if (db->nppods > 0)
		free_ppods(td, &db->ppod_region);

	free(db, M_CXGBE);
}

void
release_ddp_resources(struct toepcb *toep)
{
	int i;

	for (i = 0; i < nitems(toep->db); i++) {
		if (toep->db[i] != NULL) {
			free_ddp_buffer(toep->td, toep->db[i]);
			toep->db[i] = NULL;
		}
	}
}

/* XXX: handle_ddp_data code duplication */
void
insert_ddp_data(struct toepcb *toep, uint32_t n)
{
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct sockbuf *sb = &inp->inp_socket->so_rcv;
	struct mbuf *m;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	m = get_ddp_mbuf(n);
	tp->rcv_nxt += n;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
	tp->rcv_wnd -= n;
#endif

	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
}

/* SET_TCB_FIELD sent as a ULP command looks like this */
#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))

/* RX_DATA_ACK sent as a ULP command looks like this */
#define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))

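/*
 * Write a SET_TCB_FIELD subcommand at 'ulpmc' (as part of a larger ULP_TX
 * work request) and return a pointer just past it, padded to a 16B boundary
 * with a no-op if needed.
 */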
static inline void *
mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
    uint64_t word, uint64_t mask, uint64_t val)
{
	struct ulptx_idata *ulpsc;
	struct cpl_set_tcb_field_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__SET_TCB_FIELD_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

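/*
 * Write an RX_DATA_ACK subcommand (RX_MODULATE set) at 'ulpmc' and return a
 * pointer just past it, padded to a 16B boundary with a no-op if needed.
 */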
static inline void *
mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
{
	struct ulptx_idata *ulpsc;
	struct cpl_rx_data_ack_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
	req->credit_dack = htobe32(F_RX_MODULATE_RX);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__RX_DATA_ACK_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

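/*
 * TCB DDP flags to use for the buffer about to be handed to the chip,
 * depending on whether this is a blocking, non-blocking, or MSG_WAITALL
 * receive.
 */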
static inline uint64_t
select_ddp_flags(struct socket *so, int flags, int db_idx)
{
	uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
	int waitall = flags & MSG_WAITALL;
	int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	if (db_idx == 0) {
		ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
		if (waitall)
			ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
		else if (nb)
			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
		else
			ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
	} else {
		ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
		if (waitall)
			ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
		else if (nb)
			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
		else
			ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
	}

	return (ddp_flags);
}

static struct wrqe *
mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
    int offset, uint64_t ddp_flags)
{
	struct ddp_buffer *db = toep->db[db_idx];
	struct wrqe *wr;
	struct work_request_hdr *wrh;
	struct ulp_txpkt *ulpmc;
	int len;

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	/*
	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
	 *
	 * The work request header is 16B and always ends at a 16B boundary.
	 * The ULPTX master commands that follow must all end at 16B boundaries
	 * too so we round up the size to 16.
	 */
	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
	    roundup2(LEN__RX_DATA_ACK_ULP, 16);

	wr = alloc_wrqe(len, toep->ctrlq);
	if (wr == NULL)
		return (NULL);
	wrh = wrtod(wr);
	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
	ulpmc = (struct ulp_txpkt *)(wrh + 1);

	/* Write the buffer's tag */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    V_TCB_RX_DDP_BUF0_TAG(db->tag));

	/* Update the current offset in the DDP buffer and its total length */
	if (db_idx == 0)
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF0_LEN(db->len));
	else
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));

	/* Update DDP flags */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
	    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
	    V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);

	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);

	return (wr);
}

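/*
 * Lower the connection's DDP score; once it reaches zero, clear DDP_OK and
 * note the time DDP was turned off for this tid.
 */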
static void
discourage_ddp(struct toepcb *toep)
{

	if (toep->ddp_score && --toep->ddp_score == 0) {
		toep->ddp_flags &= ~DDP_OK;
		toep->ddp_disabled = time_uptime;
		CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
		    __func__, toep->tid, time_uptime);
	}
}

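/*
 * Process a DDP report from the chip: account for the data that was placed
 * directly, append a zero-copy "DDP" mbuf of that length to the receive
 * buffer, mark the buffer inactive, and wake up the receiver.
 */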
static int
handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
{
	uint32_t report = be32toh(ddp_report);
	unsigned int db_flag;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct mbuf *m;

	db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;

	if (__predict_false(!(report & F_DDP_INV)))
		CXGBE_UNIMPLEMENTED("DDP buffer still valid");

	INP_WLOCK(inp);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {

		/*
		 * XXX: think a bit more.
		 * tcpcb probably gone, but socket should still be around
		 * because we always wait for DDP completion in soreceive no
		 * matter what.  Just wake it up and let it clean up.
		 */

		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
		SOCKBUF_LOCK(sb);
		goto wakeup;
	}

	tp = intotcpcb(inp);
	len += be32toh(rcv_nxt) - tp->rcv_nxt;
	tp->rcv_nxt += len;
	tp->t_rcvtime = ticks;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
	tp->rcv_wnd -= len;
#endif
	m = get_ddp_mbuf(len);

	SOCKBUF_LOCK(sb);
	if (report & F_DDP_BUF_COMPLETE)
		toep->ddp_score = DDP_HIGH_SCORE;
	else
		discourage_ddp(toep);

	KASSERT(toep->sb_cc >= sb->sb_cc,
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sb->sb_cc, toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sb->sb_cc;
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(sb, m);
	toep->sb_cc = sb->sb_cc;
wakeup:
	KASSERT(toep->ddp_flags & db_flag,
	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
	    __func__, toep, toep->ddp_flags, report));
	toep->ddp_flags &= ~db_flag;
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);

	INP_WUNLOCK(inp);
	return (0);
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)

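/*
 * CPL_RX_DATA_DDP: the chip placed data into a DDP buffer and invalidated it.
 */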
static int
do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	uint32_t vld;
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	vld = be32toh(cpl->ddpvld);
	if (__predict_false(vld & DDP_ERR)) {
		panic("%s: DDP error 0x%x (tid %d, toep %p)",
		    __func__, vld, tid, toep);
	}

	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));

	return (0);
}

static int
do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);

	return (0);
}

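/*
 * Request a switch to DDP mode: clear TF_DDP_OFF, enable indicate-out for
 * both buffers, and turn off receive coalescing.  DDP_SC_REQ marks the state
 * change as in progress.
 */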
void
enable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1));
	t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), 0);
}

static inline void
disable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1));
	t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
	    V_TF_DDP_OFF(1));
}

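/*
 * Fault in and hold the pages backing the (single-iovec) uio.  On success
 * *ppages is a malloc'd array of the held pages and *pnpages its size; the
 * caller is responsible for releasing them.
 */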
static int
hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
{
	struct vm_map *map;
	struct iovec *iov;
	vm_offset_t start, end;
	vm_page_t *pp;
	int n;

	KASSERT(uio->uio_iovcnt == 1,
	    ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
	KASSERT(uio->uio_td->td_proc == curproc,
	    ("%s: uio proc (%p) is not curproc (%p)",
	    __func__, uio->uio_td->td_proc, curproc));

	map = &curproc->p_vmspace->vm_map;
	iov = &uio->uio_iov[0];
	start = trunc_page((uintptr_t)iov->iov_base);
	end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
	n = howmany(end - start, PAGE_SIZE);

	if (end - start > MAX_DDP_BUFFER_SIZE)
		return (E2BIG);

	pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
	if (pp == NULL)
		return (ENOMEM);

	if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
	    iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
		free(pp, M_CXGBE);
		return (EFAULT);
	}

	*ppages = pp;
	*pnpages = n;

	return (0);
}

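/*
 * Returns 0 if 'db' already describes the same physical pages, offset, and
 * length; non-zero otherwise.
 */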
static int
bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
{
	int i;

	if (db == NULL || db->npages != npages || db->offset != offset ||
	    db->len != len)
		return (1);

	for (i = 0; i < npages; i++) {
		if (pages[i]->phys_addr != db->pages[i]->phys_addr)
			return (1);
	}

	return (0);
}

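/* Highest common factor of n1 and n2, by Euclid's algorithm. */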
static int
calculate_hcf(int n1, int n2)
{
	int a, b, t;

	if (n1 <= n2) {
		a = n1;
		b = n2;
	} else {
		a = n2;
		b = n1;
	}

	while (a != 0) {
		t = a;
		a = b % a;
		b = t;
	}

	return (b);
}

static struct ddp_buffer *
alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset,
    int len)
{
	int i, hcf, seglen, idx, ppod, nppods;
	struct ddp_buffer *db;

	/*
	 * The DDP page size is unrelated to the VM page size.  We combine
	 * contiguous physical pages into larger segments to get the best DDP
	 * page size possible.  This is the largest of the four sizes in
	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
	 * the page list.
	 */
	hcf = 0;
	for (i = 0; i < npages; i++) {
		seglen = PAGE_SIZE;
		while (i < npages - 1 &&
		    pages[i]->phys_addr + PAGE_SIZE == pages[i + 1]->phys_addr) {
			seglen += PAGE_SIZE;
			i++;
		}

		hcf = calculate_hcf(hcf, seglen);
		if (hcf < t4_ddp_pgsz[1]) {
			idx = 0;
			goto have_pgsz;	/* give up, short circuit */
		}
	}

	if (hcf % t4_ddp_pgsz[0] != 0) {
		/* hmmm.  This could only happen when PAGE_SIZE < 4K */
		KASSERT(PAGE_SIZE < 4096,
		    ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf));
		CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d",
		    __func__, PAGE_SIZE, hcf);
		return (NULL);
	}

	for (idx = nitems(t4_ddp_pgsz) - 1; idx > 0; idx--) {
		if (hcf % t4_ddp_pgsz[idx] == 0)
			break;
	}
have_pgsz:

	db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT);
	if (db == NULL) {
		CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
		return (NULL);
	}

	nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]);
	ppod = alloc_ppods(td, nppods, &db->ppod_region);
	if (ppod < 0) {
		free(db, M_CXGBE);
		CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d",
		    __func__, nppods, len, t4_ddp_pgsz[idx]);
		return (NULL);
	}

	KASSERT(idx <= M_PPOD_PGSZ && ppod <= M_PPOD_TAG,
	    ("%s: DDP pgsz_idx = %d, ppod = %d", __func__, idx, ppod));

	db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod);
	db->nppods = nppods;
	db->npages = npages;
	db->pages = pages;
	db->offset = offset;
	db->len = len;

	CTR6(KTR_CXGBE, "New DDP buffer.  "
	    "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d",
	    t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset,
	    db->len);

	return (db);
}

#define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)

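/*
 * Write the buffer's page pods into card memory with ULP_TX_MEM_WRITE work
 * requests, NUM_ULP_TX_SC_IMM_PPODS pods at a time as immediate data.
 */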
static int
write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db)
{
	struct wrqe *wr;
	struct ulp_mem_io *ulpmc;
	struct ulptx_idata *ulpsc;
	struct pagepod *ppod;
	int i, j, k, n, chunk, len, ddp_pgsz, idx, ppod_addr;
	uint32_t cmd;

	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
	if (is_t4(sc))
		cmd |= htobe32(F_ULP_MEMIO_ORDER);
	else
		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
	ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)];
	ppod_addr = sc->vres.ddp.start + G_PPOD_TAG(db->tag) * PPOD_SIZE;
	for (i = 0; i < db->nppods; ppod_addr += chunk) {

		/* How many page pods are we writing in this cycle */
		n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
		chunk = PPOD_SZ(n);
		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);

		wr = alloc_wrqe(len, toep->ctrlq);
		if (wr == NULL)
			return (ENOMEM);	/* ok to just bail out */
		ulpmc = wrtod(wr);

		INIT_ULPTX_WR(ulpmc, len, 0, 0);
		ulpmc->cmd = cmd;
		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));

		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
		ulpsc->len = htobe32(chunk);

		ppod = (struct pagepod *)(ulpsc + 1);
		for (j = 0; j < n; i++, j++, ppod++) {
			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
			    V_PPOD_TID(toep->tid) | db->tag);
			ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
			    V_PPOD_OFST(db->offset));
			ppod->rsvd = 0;
			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
			for (k = 0; k < nitems(ppod->addr); k++) {
				if (idx < db->npages) {
					ppod->addr[k] =
					    htobe64(db->pages[idx]->phys_addr);
					idx += ddp_pgsz / PAGE_SIZE;
				} else
					ppod->addr[k] = 0;
#if 0
				CTR5(KTR_CXGBE,
				    "%s: tid %d ppod[%d]->addr[%d] = %p",
				    __func__, toep->tid, i, k,
				    htobe64(ppod->addr[k]));
#endif
			}

		}

		t4_wrq_tx(sc, wr);
	}

	return (0);
}

/*
 * Reuse, or allocate (and program the page pods for) a new DDP buffer.  The
 * "pages" array is handed over to this function and should not be used in any
 * way by the caller after that.
 */
static int
select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
    int npages, int db_off, int db_len)
{
	struct ddp_buffer *db;
	struct tom_data *td = sc->tom_softc;
	int i, empty_slot = -1;

	/* Try to reuse */
	for (i = 0; i < nitems(toep->db); i++) {
		if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
			free(pages, M_CXGBE);
			return (i);	/* pages still held */
		} else if (toep->db[i] == NULL && empty_slot < 0)
			empty_slot = i;
	}

	/* Allocate new buffer, write its page pods. */
	db = alloc_ddp_buffer(td, pages, npages, db_off, db_len);
	if (db == NULL) {
		vm_page_unhold_pages(pages, npages);
		free(pages, M_CXGBE);
		return (-1);
	}
	if (write_page_pods(sc, toep, db) != 0) {
		vm_page_unhold_pages(pages, npages);
		free_ddp_buffer(td, db);
		return (-1);
	}

	i = empty_slot;
	if (i < 0) {
		i = arc4random() % nitems(toep->db);
		free_ddp_buffer(td, toep->db[i]);
	}
	toep->db[i] = db;

	CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
	    __func__, toep->tid, i, db, db->tag);

	return (i);
}

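/*
 * Turn the hold on each page of the buffer into a wiring so the pages stay
 * resident while the chip DMAs into them.
 */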
static void
wire_ddp_buffer(struct ddp_buffer *db)
{
	int i;
	vm_page_t p;

	for (i = 0; i < db->npages; i++) {
		p = db->pages[i];
		vm_page_lock(p);
		vm_page_wire(p);
		vm_page_unhold(p);
		vm_page_unlock(p);
	}
}

static void
unwire_ddp_buffer(struct ddp_buffer *db)
{
	int i;
	vm_page_t p;

	for (i = 0; i < db->npages; i++) {
		p = db->pages[i];
		vm_page_lock(p);
		vm_page_unwire(p, 0);
		vm_page_unlock(p);
	}
}

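/*
 * Set up a DDP receive into the user buffer described by the uio and wait for
 * it to finish.  Called, and returns, with the socket buffer locked.  Returns
 * 0 without attempting DDP (and disables it) if the receive isn't a good fit.
 */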
static int
handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
{
	struct sockbuf *sb = &so->so_rcv;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);
	vm_page_t *pages;
	int npages, db_idx, rc, buf_flag;
	struct ddp_buffer *db;
	struct wrqe *wr;
	uint64_t ddp_flags;

	SOCKBUF_LOCK_ASSERT(sb);

#if 0
	if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
		CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
		    __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
	}
#endif

	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
	if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
		goto no_ddp;

	/*
	 * Fault in and then hold the pages of the uio buffers.  We'll wire them
	 * a bit later if everything else works out.
	 */
	SOCKBUF_UNLOCK(sb);
	if (hold_uio(uio, &pages, &npages) != 0) {
		SOCKBUF_LOCK(sb);
		goto no_ddp;
	}
	SOCKBUF_LOCK(sb);
	if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
		vm_page_unhold_pages(pages, npages);
		free(pages, M_CXGBE);
		goto no_ddp;
	}

	/*
	 * Figure out which one of the two DDP buffers to use this time.
	 */
	db_idx = select_ddp_buffer(sc, toep, pages, npages,
	    (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
	pages = NULL;	/* handed off to select_ddp_buffer */
	if (db_idx < 0)
		goto no_ddp;
	db = toep->db[db_idx];
	buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;

	/*
	 * Build the compound work request that tells the chip where to DMA the
	 * payload.
	 */
	ddp_flags = select_ddp_flags(so, flags, db_idx);
	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
	if (wr == NULL) {
		/*
		 * Just unhold the pages.  The DDP buffer's software state is
		 * left as-is in the toep.  The page pods were written
		 * successfully and we may have an opportunity to use it in the
		 * future.
		 */
		vm_page_unhold_pages(db->pages, db->npages);
		goto no_ddp;
	}

	/* Wire (and then unhold) the pages, and give the chip the go-ahead. */
	wire_ddp_buffer(db);
	t4_wrq_tx(sc, wr);
	sb->sb_flags &= ~SB_DDP_INDICATE;
	toep->ddp_flags |= buf_flag;

	/*
	 * Wait for the DDP operation to complete and then unwire the pages.
	 * The return code from the sbwait will be the final return code of this
	 * function.  But we do need to wait for DDP no matter what.
	 */
	rc = sbwait(sb);
	while (toep->ddp_flags & buf_flag) {
		sb->sb_flags |= SB_WAIT;
		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK, "sbwait", 0);
	}
	unwire_ddp_buffer(db);
	return (rc);
no_ddp:
	disable_ddp(sc, toep);
	discourage_ddp(toep);
	sb->sb_flags &= ~SB_DDP_INDICATE;
	return (0);
}

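/* Set up the page pod pool and register the DDP CPL handlers. */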
void
t4_init_ddp(struct adapter *sc, struct tom_data *td)
{
	int nppods = sc->vres.ddp.size / PPOD_SIZE;

	td->nppods = nppods;
	td->nppods_free = nppods;
	td->nppods_free_head = nppods;
	TAILQ_INIT(&td->ppods);
	mtx_init(&td->ppod_lock, "page pods", NULL, MTX_DEF);

	t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp);
	t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
}

void
t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td)
{

	KASSERT(td->nppods == td->nppods_free,
	    ("%s: page pods still in use, nppods = %d, free = %d",
	    __func__, td->nppods, td->nppods_free));

	if (mtx_initialized(&td->ppod_lock))
		mtx_destroy(&td->ppod_lock);
}

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{

	CXGBE_UNIMPLEMENTED(__func__);
}

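/*
 * DDP data lands directly in the receive buffer, so the mbufs appended to the
 * socket buffer carry no payload of their own.  They are recognized by their
 * magic m_data pointer and are charged against the uio with UIO_NOCOPY.
 */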
static char ddp_magic_str[] = "nothing to see here";

struct mbuf *
get_ddp_mbuf(int len)
{
	struct mbuf *m;

	m = m_get(M_NOWAIT, MT_DATA);
	if (m == NULL)
		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
	m->m_len = len;
	m->m_data = &ddp_magic_str[0];

	return (m);
}

static inline int
is_ddp_mbuf(struct mbuf *m)
{

	return (m->m_data == &ddp_magic_str[0]);
}

/*
 * Copy an mbuf chain into a uio limited by len if set.
 */
static int
m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
{
	int error, length, total;
	int progress = 0;

	if (len > 0)
		total = min(uio->uio_resid, len);
	else
		total = uio->uio_resid;

	/* Fill the uio with data from the mbufs. */
	for (; m != NULL; m = m->m_next) {
		length = min(m->m_len, total - progress);

		if (is_ddp_mbuf(m)) {
			enum uio_seg segflag = uio->uio_segflg;

			uio->uio_segflg	= UIO_NOCOPY;
			error = uiomove(mtod(m, void *), length, uio);
			uio->uio_segflg	= segflag;
		} else
			error = uiomove(mtod(m, void *), length, uio);
		if (error)
			return (error);

		progress += length;
	}

	return (0);
}

/*
 * Based on soreceive_stream() in uipc_socket.c
 */
int
t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid, ddp_handled = 0;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {

		/* uio should be just as it was at entry */
		KASSERT(oresid == uio->uio_resid,
		    ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
		    __func__, oresid, uio->uio_resid, sb->sb_cc));

		error = handle_ddp(so, uio, flags, 0);
		ddp_handled = 1;
		if (error)
			goto out;
	}

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sb->sb_cc > 0)
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sb->sb_cc > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sb->sb_cc == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
	    ((sb->sb_flags & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sb->sb_cc >= sb->sb_lowat ||
	     sb->sb_cc >= uio->uio_resid ||
	     sb->sb_cc >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error) {
		if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
			(void) handle_ddp(so, uio, flags, 1);
			ddp_handled = 1;
		}
		goto out;
	}
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
		goto restart;

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sb->sb_cc);
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

#endif