1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#include <sys/types.h>
27#include <sys/param.h>
28#include <sys/cmn_err.h>
29#include <sys/uio.h>
30#include <sys/stropts.h>
31#include <sys/strsun.h>
32#include <sys/systm.h>
33#include <sys/socketvar.h>
34#include <fs/sockfs/sodirect.h>
35
36/*
37 * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
38 * we use a consolidation private KAPI to allow the protocol to start
39 * an asynchronous copyout to a user-land receive-side buffer (uioa)
40 * when a blocking socket read (e.g. read, recv, ...) is pending.
41 *
 * In broad strokes, this is what happens. When recv is called,
43 * we first determine whether it would be beneficial to use uioa, and
44 * if so set up the required state (all done by sod_rcv_init()).
45 * The protocol can only initiate asynchronous copyout if the receive
46 * queue is empty, so the first thing we do is drain any previously
47 * queued data (using sod_uioa_so_init()). Once the copyouts (if any)
48 * have been scheduled we wait for the receive to be satisfied. During
49 * that time any new mblks that are enqueued will be scheduled to be
50 * copied out asynchronously (sod_uioa_mblk_init()). When the receive
51 * has been satisfied we wait for all scheduled copyout operations to
 * complete before we return to the user (sod_rcv_done()).
53 */
54
/*
 * kmem cache backing every sodirect_t in this file: created once by
 * sod_init(), allocated from in sod_sock_init() and returned to in
 * sod_sock_fini().
 */
static struct kmem_cache *sock_sod_cache;
56
57/*
58 * This function is called at the beginning of recvmsg().
59 *
60 * If I/OAT is enabled on this sonode, initialize the uioa state machine
61 * with state UIOA_ALLOC.
62 */
63uio_t *
64sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
65{
66	struct uio *suiop;
67	struct uio *uiop;
68	sodirect_t *sodp = so->so_direct;
69
70	if (sodp == NULL)
71		return (NULL);
72
73	suiop = NULL;
74	uiop = *uiopp;
75
76	mutex_enter(&so->so_lock);
77	if (uiop->uio_resid >= uioasync.mincnt &&
78	    sodp != NULL && sodp->sod_enabled &&
79	    uioasync.enabled && !(flags & MSG_PEEK) &&
80	    !so->so_proto_props.sopp_loopback && so->so_filter_active == 0 &&
81	    !(so->so_state & SS_CANTRCVMORE)) {
82		/*
83		 * Big enough I/O for uioa min setup and an sodirect socket
84		 * and sodirect enabled and uioa enabled and I/O will be done
85		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
86		 */
87		if (!uioainit(uiop, &sodp->sod_uioa)) {
88			/*
89			 * Successful uioainit() so the uio_t part of the
90			 * uioa_t will be used for all uio_t work to follow,
91			 * we return the original "uiop" in "suiop".
92			 */
93			suiop = uiop;
94			*uiopp = (uio_t *)&sodp->sod_uioa;
95			/*
96			 * Before returning to the caller the passed in uio_t
97			 * "uiop" will be updated via a call to uioafini()
98			 * below.
99			 *
100			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
101			 * here as first we have to uioamove() any currently
102			 * queued M_DATA mblk_t(s) so it will be done later.
103			 */
104		}
105	}
106	mutex_exit(&so->so_lock);
107
108	return (suiop);
109}
110
111/*
112 * This function is called at the end of recvmsg(), it finializes all the I/OAT
113 * operations, and reset the uioa state to UIOA_ALLOC.
114 */
115int
116sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
117{
118	int error = 0;
119	sodirect_t *sodp = so->so_direct;
120	mblk_t *mp;
121
122	if (sodp == NULL) {
123		return (0);
124	}
125
126	ASSERT(MUTEX_HELD(&so->so_lock));
127	/* Finish any sodirect and uioa processing */
128	if (suiop != NULL) {
129		/* Finish any uioa_t processing */
130
131		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
132		error = uioafini(suiop, (uioa_t *)uiop);
133		if ((mp = sodp->sod_uioafh) != NULL) {
134			sodp->sod_uioafh = NULL;
135			sodp->sod_uioaft = NULL;
136			freemsg(mp);
137		}
138	}
139	ASSERT(sodp->sod_uioafh == NULL);
140
141	return (error);
142}
143
144/*
145 * Schedule a uioamove() on a mblk. This is done as mblks are enqueued
146 * by the protocol on the socket's rcv queue.
147 *
148 * Caller must be holding so_lock.
149 */
150void
151sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
152{
153	uioa_t *uioap = &sodp->sod_uioa;
154	mblk_t *mp1 = mp;
155	mblk_t *lmp = NULL;
156
157	ASSERT(DB_TYPE(mp) == M_DATA);
158	ASSERT(msg_size == msgdsize(mp));
159
160	if (uioap->uioa_state & UIOA_ENABLED) {
161		/* Uioa is enabled */
162
163		if (msg_size > uioap->uio_resid) {
164			/*
165			 * There isn't enough uio space for the mblk_t chain
166			 * so disable uioa such that this and any additional
167			 * mblk_t data is handled by the socket and schedule
168			 * the socket for wakeup to finish this uioa.
169			 */
170			uioap->uioa_state &= UIOA_CLR;
171			uioap->uioa_state |= UIOA_FINI;
172			return;
173		}
174		do {
175			uint32_t	len = MBLKL(mp1);
176
177			if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
178				/* Scheduled, mark dblk_t as such */
179				DB_FLAGS(mp1) |= DBLK_UIOA;
180			} else {
181				/* Error, turn off async processing */
182				uioap->uioa_state &= UIOA_CLR;
183				uioap->uioa_state |= UIOA_FINI;
184				break;
185			}
186			lmp = mp1;
187		} while ((mp1 = mp1->b_cont) != NULL);
188
189		if (mp1 != NULL || uioap->uio_resid == 0) {
190			/* Break the mblk chain if neccessary. */
191			if (mp1 != NULL && lmp != NULL) {
192				mp->b_next = mp1;
193				lmp->b_cont = NULL;
194			}
195		}
196	}
197}
198
199/*
200 * This function is called on a mblk that thas been successfully uioamoved().
201 */
202void
203sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
204{
205	if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
206		/*
207		 * A uioa flaged mblk_t chain, already uio processed,
208		 * add it to the sodirect uioa pending free list.
209		 *
210		 * Note, a b_cont chain headed by a DBLK_UIOA enable
211		 * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
212		 */
213		mblk_t	*bpt = sodp->sod_uioaft;
214
215		ASSERT(sodp != NULL);
216
217		/*
218		 * Add first mblk_t of "bp" chain to current sodirect uioa
219		 * free list tail mblk_t, if any, else empty list so new head.
220		 */
221		if (bpt == NULL)
222			sodp->sod_uioafh = bp;
223		else
224			bpt->b_cont = bp;
225
226		/*
227		 * Walk mblk_t "bp" chain to find tail and adjust rptr of
228		 * each to reflect that uioamove() has consumed all data.
229		 */
230		bpt = bp;
231		for (;;) {
232			ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
233
234			bpt->b_rptr = bpt->b_wptr;
235			if (bpt->b_cont == NULL)
236				break;
237			bpt = bpt->b_cont;
238		}
239		/* New sodirect uioa free list tail */
240		sodp->sod_uioaft = bpt;
241
242		/* Only dequeue once with data returned per uioa_t */
243		if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
244			sodp->sod_uioa.uioa_state &= UIOA_CLR;
245			sodp->sod_uioa.uioa_state |= UIOA_FINI;
246		}
247	}
248}
249
/*
 * Schedule uioamove() for data that is already queued on a non-STREAMS
 * socket when a receive starts using uioa.
 *
 * Per the original author's note this is called from recvmsg() on the
 * transition from UIOA_INIT to UIOA_ENABLED; that transition itself is not
 * performed here (NOTE(review): presumably done by the caller - confirm).
 *
 * so    - socket whose queued data is examined; so_lock must be held.
 * sodp  - sodirect state of "so"; its sod_uioa must be the object "uiop"
 *         points at.
 * uiop  - the uio_t embedded in sodp->sod_uioa (used here as a uioa_t).
 *
 * The b_cont chain of the first message on so_rcv_q_head is walked, and
 * then, at most once, the first message on so_rcv_head. The walk stops
 * at the first mblk that is not M_DATA, does not fit in the remaining uio
 * space, would reach the OOB mark, or fails uioamove(). In that case the
 * uioa state is switched to UIOA_FINI and the message is split so that the
 * already scheduled mblks stay together and the rest becomes a separate
 * message following it.
 */
void
sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
{
	uioa_t	*uioap = (uioa_t *)uiop;
	mblk_t	*lbp;		/* last mblk of the chain processed so far */
	mblk_t	*wbp;		/* mblk currently being examined */
	mblk_t	*bp;		/* head of the message being walked */
	int	len;
	int	error;
	boolean_t in_rcv_q = B_TRUE;	/* B_TRUE while on so_rcv_q_head */

	ASSERT(MUTEX_HELD(&so->so_lock));
	ASSERT(&sodp->sod_uioa == uioap);

	/*
	 * Walk the first b_cont chain on so_rcv_q_head
	 * and schedule any M_DATA mblk_t's for uio asynchronous move.
	 */
	bp = so->so_rcv_q_head;

again:
	/* Walk the chain */
	lbp = NULL;
	wbp = bp;

	do {
		/* Empty list: wbp == bp == NULL, so there is nothing to walk */
		if (bp == NULL)
			break;

		if (wbp->b_datap->db_type != M_DATA) {
			/* Not M_DATA, no more uioa */
			goto nouioa;
		}
		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
			/* Have a M_DATA mblk_t with data */
			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
				/* Not enough uio space, or beyond oobmark */
				goto nouioa;
			}
			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
			error = uioamove(wbp->b_rptr, len,
			    UIO_READ, uioap);
			if (!error) {
				/* Scheduled, mark dblk_t as such */
				wbp->b_datap->db_flags |= DBLK_UIOA;
			} else {
				/* Break the mblk chain */
				goto nouioa;
			}
		}
		/* Save last wbp processed (zero-length mblks count too) */
		lbp = wbp;
	} while ((wbp = wbp->b_cont) != NULL);

	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
		/*
		 * We get here only once (in_rcv_q is cleared below) to
		 * process so_rcv_head, and only if so_rcv_q_head is empty or
		 * its sole message has been completely uioamove()d.
		 */
		in_rcv_q = B_FALSE;

		/* Move on to the so_rcv_head list */
		bp = so->so_rcv_head;
		goto again;
	}

	return;

nouioa:
	/* No more uioa */
	uioap->uioa_state &= UIOA_CLR;
	uioap->uioa_state |= UIOA_FINI;

	/*
	 * If we processed 1 or more mblk_t(s) then we need to split the
	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
	 * are in the current chain and the rest are in the following new
	 * chain. If nothing was processed (lbp == NULL) the message is
	 * left untouched.
	 */
	if (lbp != NULL) {
		/* New end of current chain */
		lbp->b_cont = NULL;

		/*
		 * Insert new chain wbp after bp. If bp was the last message
		 * on its list, wbp becomes the new last head of that list.
		 */
		if ((wbp->b_next = bp->b_next) == NULL) {
			if (in_rcv_q)
				so->so_rcv_q_last_head = wbp;
			else
				so->so_rcv_last_head = wbp;
		}
		bp->b_next = wbp;
		/*
		 * NOTE(review): b_prev appears to serve as a per-message
		 * tail pointer here - the new message inherits bp's old
		 * value and bp now ends at lbp. Confirm against the code
		 * that enqueues onto these lists.
		 */
		bp->b_next->b_prev = bp->b_prev;
		bp->b_prev = lbp;
	}
}
352
353/*
354 * Initialize sodirect data structures on a socket.
355 */
356void
357sod_sock_init(struct sonode *so)
358{
359	sodirect_t	*sodp;
360
361	ASSERT(so->so_direct == NULL);
362
363	so->so_state |= SS_SODIRECT;
364
365	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
366	sodp->sod_enabled = B_TRUE;
367	sodp->sod_uioafh = NULL;
368	sodp->sod_uioaft = NULL;
369	/*
370	 * Remainder of the sod_uioa members are left uninitialized
371	 * but will be initialized later by uioainit() before uioa
372	 * is enabled.
373	 */
374	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
375	so->so_direct = sodp;
376}
377
378void
379sod_sock_fini(struct sonode *so)
380{
381	sodirect_t *sodp = so->so_direct;
382
383	ASSERT(sodp->sod_uioafh == NULL);
384
385	so->so_direct = NULL;
386	kmem_cache_free(sock_sod_cache, sodp);
387}
388
389/*
390 * Init the sodirect kmem cache while sockfs is loading.
391 */
392int
393sod_init()
394{
395	/* Allocate sodirect_t kmem_cache */
396	sock_sod_cache = kmem_cache_create("sock_sod_cache",
397	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
398
399	return (0);
400}
401
402ssize_t
403sod_uioa_mblk(struct sonode *so, mblk_t *mp)
404{
405	sodirect_t *sodp = so->so_direct;
406
407	ASSERT(sodp != NULL);
408	ASSERT(MUTEX_HELD(&so->so_lock));
409
410	ASSERT(sodp->sod_enabled);
411	ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
412
413	ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
414
415	if (mp == NULL && so->so_rcv_q_head != NULL) {
416		mp = so->so_rcv_q_head;
417		ASSERT(mp->b_prev != NULL);
418		mp->b_prev = NULL;
419		so->so_rcv_q_head = mp->b_next;
420		if (so->so_rcv_q_head == NULL) {
421			so->so_rcv_q_last_head = NULL;
422		}
423		mp->b_next = NULL;
424	}
425
426	sod_uioa_mblk_done(sodp, mp);
427
428	if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
429	    DB_TYPE(so->so_rcv_head) == M_DATA &&
430	    (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
431		/* more arrived */
432		ASSERT(so->so_rcv_q_head == NULL);
433		mp = so->so_rcv_head;
434		so->so_rcv_head = mp->b_next;
435		if (so->so_rcv_head == NULL)
436			so->so_rcv_last_head = NULL;
437		mp->b_prev = mp->b_next = NULL;
438		sod_uioa_mblk_done(sodp, mp);
439	}
440
441#ifdef DEBUG
442	if (so->so_rcv_q_head != NULL) {
443		mblk_t *m = so->so_rcv_q_head;
444		while (m != NULL) {
445			if (DB_FLAGS(m) & DBLK_UIOA) {
446				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
447				    " in so_rcv_q_head.\n", (void *)m);
448			}
449			m = m->b_next;
450		}
451	}
452	if (so->so_rcv_head != NULL) {
453		mblk_t *m = so->so_rcv_head;
454		while (m != NULL) {
455			if (DB_FLAGS(m) & DBLK_UIOA) {
456				cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
457				    " in so_rcv_head.\n", (void *)m);
458			}
459			m = m->b_next;
460		}
461	}
462#endif
463	return (sodp->sod_uioa.uioa_mbytes);
464}
465