move.c revision 8348:4137e18bfaf0
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27/*	  All Rights Reserved  	*/
28
29/*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39#include <sys/types.h>
40#include <sys/sysmacros.h>
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/uio.h>
44#include <sys/errno.h>
45#include <sys/vmsystm.h>
46#include <sys/cmn_err.h>
47#include <vm/as.h>
48#include <vm/page.h>
49
50#include <sys/dcopy.h>
51
/*
 * Polling policy consulted by uioamove() and uioafini() for outstanding
 * asynchronous copies: a negative value means uioafini() only polls, zero
 * means it always blocks, and a positive value means it blocks once its
 * poll counter reaches this value.  When non-negative, uioamove() also
 * posts its commands with DCOPY_CMD_INTR.
 */
int64_t uioa_maxpoll = -1;	/* <0 = noblock, 0 = block, >0 = block after */
/* Indices into a uioa_t's uioa_hwst[] array holding the dcopy state */
#define	UIO_DCOPY_CHANNEL	0
#define	UIO_DCOPY_CMD		1
55
56/*
57 * Move "n" bytes at byte address "p"; "rw" indicates the direction
58 * of the move, and the I/O parameters are provided in "uio", which is
59 * update to reflect the data which was moved.  Returns 0 on success or
60 * a non-zero errno on failure.
61 */
62int
63uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
64{
65	struct iovec *iov;
66	ulong_t cnt;
67	int error;
68
69	while (n && uio->uio_resid) {
70		iov = uio->uio_iov;
71		cnt = MIN(iov->iov_len, n);
72		if (cnt == 0l) {
73			uio->uio_iov++;
74			uio->uio_iovcnt--;
75			continue;
76		}
77		switch (uio->uio_segflg) {
78
79		case UIO_USERSPACE:
80		case UIO_USERISPACE:
81			if (rw == UIO_READ) {
82				error = xcopyout_nta(p, iov->iov_base, cnt,
83				    (uio->uio_extflg & UIO_COPY_CACHED));
84			} else {
85				error = xcopyin_nta(iov->iov_base, p, cnt,
86				    (uio->uio_extflg & UIO_COPY_CACHED));
87			}
88
89			if (error)
90				return (error);
91			break;
92
93		case UIO_SYSSPACE:
94			if (rw == UIO_READ)
95				error = kcopy_nta(p, iov->iov_base, cnt,
96				    (uio->uio_extflg & UIO_COPY_CACHED));
97			else
98				error = kcopy_nta(iov->iov_base, p, cnt,
99				    (uio->uio_extflg & UIO_COPY_CACHED));
100			if (error)
101				return (error);
102			break;
103		}
104		iov->iov_base += cnt;
105		iov->iov_len -= cnt;
106		uio->uio_resid -= cnt;
107		uio->uio_loffset += cnt;
108		p = (caddr_t)p + cnt;
109		n -= cnt;
110	}
111	return (0);
112}
113
114/*
115 * Fault in the pages of the first n bytes specified by the uio structure.
116 * 1 byte in each page is touched and the uio struct is unmodified. Any
117 * error will terminate the process as this is only a best attempt to get
118 * the pages resident.
119 */
120void
121uio_prefaultpages(ssize_t n, struct uio *uio)
122{
123	struct iovec *iov;
124	ulong_t cnt, incr;
125	caddr_t p;
126	uint8_t tmp;
127	int iovcnt;
128
129	iov = uio->uio_iov;
130	iovcnt = uio->uio_iovcnt;
131
132	while ((n > 0) && (iovcnt > 0)) {
133		cnt = MIN(iov->iov_len, n);
134		if (cnt == 0) {
135			/* empty iov entry */
136			iov++;
137			iovcnt--;
138			continue;
139		}
140		n -= cnt;
141		/*
142		 * touch each page in this segment.
143		 */
144		p = iov->iov_base;
145		while (cnt) {
146			switch (uio->uio_segflg) {
147			case UIO_USERSPACE:
148			case UIO_USERISPACE:
149				if (fuword8(p, &tmp))
150					return;
151				break;
152			case UIO_SYSSPACE:
153				if (kcopy(p, &tmp, 1))
154					return;
155				break;
156			}
157			incr = MIN(cnt, PAGESIZE);
158			p += incr;
159			cnt -= incr;
160		}
161		/*
162		 * touch the last byte in case it straddles a page.
163		 */
164		p--;
165		switch (uio->uio_segflg) {
166		case UIO_USERSPACE:
167		case UIO_USERISPACE:
168			if (fuword8(p, &tmp))
169				return;
170			break;
171		case UIO_SYSSPACE:
172			if (kcopy(p, &tmp, 1))
173				return;
174			break;
175		}
176		iov++;
177		iovcnt--;
178	}
179}
180
181/*
182 * transfer a character value into the address space
183 * delineated by a uio and update fields within the
184 * uio for next character. Return 0 for success, EFAULT
185 * for error.
186 */
187int
188ureadc(int val, struct uio *uiop)
189{
190	struct iovec *iovp;
191	unsigned char c;
192
193	/*
194	 * first determine if uio is valid.  uiop should be
195	 * non-NULL and the resid count > 0.
196	 */
197	if (!(uiop && uiop->uio_resid > 0))
198		return (EFAULT);
199
200	/*
201	 * scan through iovecs until one is found that is non-empty.
202	 * Return EFAULT if none found.
203	 */
204	while (uiop->uio_iovcnt > 0) {
205		iovp = uiop->uio_iov;
206		if (iovp->iov_len <= 0) {
207			uiop->uio_iovcnt--;
208			uiop->uio_iov++;
209		} else
210			break;
211	}
212
213	if (uiop->uio_iovcnt <= 0)
214		return (EFAULT);
215
216	/*
217	 * Transfer character to uio space.
218	 */
219
220	c = (unsigned char) (val & 0xFF);
221
222	switch (uiop->uio_segflg) {
223
224	case UIO_USERISPACE:
225	case UIO_USERSPACE:
226		if (copyout(&c, iovp->iov_base, sizeof (unsigned char)))
227			return (EFAULT);
228		break;
229
230	case UIO_SYSSPACE: /* can do direct copy since kernel-kernel */
231		*iovp->iov_base = c;
232		break;
233
234	default:
235		return (EFAULT); /* invalid segflg value */
236	}
237
238	/*
239	 * bump up/down iovec and uio members to reflect transfer.
240	 */
241	iovp->iov_base++;
242	iovp->iov_len--;
243	uiop->uio_resid--;
244	uiop->uio_loffset++;
245	return (0); /* success */
246}
247
248/*
249 * return a character value from the address space
250 * delineated by a uio and update fields within the
251 * uio for next character. Return the character for success,
252 * -1 for error.
253 */
254int
255uwritec(struct uio *uiop)
256{
257	struct iovec *iovp;
258	unsigned char c;
259
260	/*
261	 * verify we were passed a valid uio structure.
262	 * (1) non-NULL uiop, (2) positive resid count
263	 * (3) there is an iovec with positive length
264	 */
265
266	if (!(uiop && uiop->uio_resid > 0))
267		return (-1);
268
269	while (uiop->uio_iovcnt > 0) {
270		iovp = uiop->uio_iov;
271		if (iovp->iov_len <= 0) {
272			uiop->uio_iovcnt--;
273			uiop->uio_iov++;
274		} else
275			break;
276	}
277
278	if (uiop->uio_iovcnt <= 0)
279		return (-1);
280
281	/*
282	 * Get the character from the uio address space.
283	 */
284	switch (uiop->uio_segflg) {
285
286	case UIO_USERISPACE:
287	case UIO_USERSPACE:
288		if (copyin(iovp->iov_base, &c, sizeof (unsigned char)))
289			return (-1);
290		break;
291
292	case UIO_SYSSPACE:
293		c = *iovp->iov_base;
294		break;
295
296	default:
297		return (-1); /* invalid segflg */
298	}
299
300	/*
301	 * Adjust fields of iovec and uio appropriately.
302	 */
303	iovp->iov_base++;
304	iovp->iov_len--;
305	uiop->uio_resid--;
306	uiop->uio_loffset++;
307	return ((int)c & 0xFF); /* success */
308}
309
310/*
311 * Drop the next n chars out of *uiop.
312 */
313void
314uioskip(uio_t *uiop, size_t n)
315{
316	if (n > uiop->uio_resid)
317		return;
318	while (n != 0) {
319		register iovec_t	*iovp = uiop->uio_iov;
320		register size_t		niovb = MIN(iovp->iov_len, n);
321
322		if (niovb == 0) {
323			uiop->uio_iov++;
324			uiop->uio_iovcnt--;
325			continue;
326		}
327		iovp->iov_base += niovb;
328		uiop->uio_loffset += niovb;
329		iovp->iov_len -= niovb;
330		uiop->uio_resid -= niovb;
331		n -= niovb;
332	}
333}
334
335/*
336 * Dup the suio into the duio and diovec of size diov_cnt. If diov
337 * is too small to dup suio then an error will be returned, else 0.
338 */
339int
340uiodup(uio_t *suio, uio_t *duio, iovec_t *diov, int diov_cnt)
341{
342	int ix;
343	iovec_t *siov = suio->uio_iov;
344
345	*duio = *suio;
346	for (ix = 0; ix < suio->uio_iovcnt; ix++) {
347		diov[ix] = siov[ix];
348		if (ix >= diov_cnt)
349			return (1);
350	}
351	duio->uio_iov = diov;
352	return (0);
353}
354
355/*
356 * Shadow state for checking if a platform has hardware asynchronous
357 * copy capability and minimum copy size, e.g. Intel's I/OAT dma engine,
358 *
359 * Dcopy does a call-back to uioa_dcopy_enable() when a dma device calls
360 * into dcopy to register and uioa_dcopy_disable() when the device calls
361 * into dcopy to unregister.
362 */
363uioasync_t uioasync = {B_FALSE, 1024};
364
365void
366uioa_dcopy_enable()
367{
368	uioasync.enabled = B_TRUE;
369}
370
371void
372uioa_dcopy_disable()
373{
374	uioasync.enabled = B_FALSE;
375}
376
377/*
378 * Schedule an asynchronous move of "n" bytes at byte address "p",
379 * "rw" indicates the direction of the move, I/O parameters and
380 * async state are provided in "uioa" which is update to reflect
381 * the data which is to be moved.
382 *
383 * Returns 0 on success or a non-zero errno on failure.
384 *
385 * Note, while the uioasync APIs are general purpose in design
386 * the current implementation is Intel I/OAT specific.
387 */
388int
389uioamove(void *p, size_t n, enum uio_rw rw, uioa_t *uioa)
390{
391	int		soff, doff;
392	uint64_t	pa;
393	int		cnt;
394	iovec_t		*iov;
395	dcopy_handle_t	channel;
396	dcopy_cmd_t	cmd;
397	int		ret = 0;
398	int		dcopy_flags;
399
400	if (!(uioa->uioa_state & UIOA_ENABLED)) {
401		/* The uioa_t isn't enabled */
402		return (ENXIO);
403	}
404
405	if (uioa->uio_segflg != UIO_USERSPACE || rw != UIO_READ) {
406		/* Only support to user-land from kernel */
407		return (ENOTSUP);
408	}
409
410
411	channel = uioa->uioa_hwst[UIO_DCOPY_CHANNEL];
412	cmd = uioa->uioa_hwst[UIO_DCOPY_CMD];
413	dcopy_flags = DCOPY_NOSLEEP;
414
415	/*
416	 * While source bytes and destination bytes.
417	 */
418	while (n > 0 && uioa->uio_resid > 0) {
419		iov = uioa->uio_iov;
420		if (iov->iov_len == 0l) {
421			uioa->uio_iov++;
422			uioa->uio_iovcnt--;
423			uioa->uioa_lcur++;
424			uioa->uioa_lppp = uioa->uioa_lcur->uioa_ppp;
425			continue;
426		}
427		/*
428		 * While source bytes schedule an async
429		 * dma for destination page by page.
430		 */
431		while (n > 0) {
432			/* Addr offset in page src/dst */
433			soff = (uintptr_t)p & PAGEOFFSET;
434			doff = (uintptr_t)iov->iov_base & PAGEOFFSET;
435			/* Min copy count src and dst and page sized */
436			cnt = MIN(n, iov->iov_len);
437			cnt = MIN(cnt, PAGESIZE - soff);
438			cnt = MIN(cnt, PAGESIZE - doff);
439			/* XXX if next page(s) contiguous could use multipage */
440
441			/*
442			 * if we have an old command, we want to link all
443			 * other commands to the next command we alloced so
444			 * we only need to track the last command but can
445			 * still free them all.
446			 */
447			if (cmd != NULL) {
448				dcopy_flags |= DCOPY_ALLOC_LINK;
449			}
450			ret = dcopy_cmd_alloc(channel, dcopy_flags, &cmd);
451			if (ret != DCOPY_SUCCESS) {
452				/* Error of some sort */
453				return (EIO);
454			}
455			uioa->uioa_hwst[UIO_DCOPY_CMD] = cmd;
456
457			ASSERT(cmd->dp_version == DCOPY_CMD_V0);
458			if (uioa_maxpoll >= 0) {
459				/* Blocking (>0 may be) used in uioafini() */
460				cmd->dp_flags = DCOPY_CMD_INTR;
461			} else {
462				/* Non blocking uioafini() so no intr */
463				cmd->dp_flags = DCOPY_CMD_NOFLAGS;
464			}
465			cmd->dp_cmd = DCOPY_CMD_COPY;
466			pa = ptob((uint64_t)hat_getpfnum(kas.a_hat, p));
467			cmd->dp.copy.cc_source = pa + soff;
468			if (uioa->uioa_lcur->uioa_pfncnt == 0) {
469				/* Have a (page_t **) */
470				pa = ptob((uint64_t)(
471				    *(page_t **)uioa->uioa_lppp)->p_pagenum);
472			} else {
473				/* Have a (pfn_t *) */
474				pa = ptob((uint64_t)(
475				    *(pfn_t *)uioa->uioa_lppp));
476			}
477			cmd->dp.copy.cc_dest = pa + doff;
478			cmd->dp.copy.cc_size = cnt;
479			ret = dcopy_cmd_post(cmd);
480			if (ret != DCOPY_SUCCESS) {
481				/* Error of some sort */
482				return (EIO);
483			}
484			ret = 0;
485
486			/* If UIOA_POLL not set, set it */
487			if (!(uioa->uioa_state & UIOA_POLL))
488				uioa->uioa_state |= UIOA_POLL;
489
490			/* Update iov, uio, and local pointers/counters */
491			iov->iov_base += cnt;
492			iov->iov_len -= cnt;
493			uioa->uio_resid -= cnt;
494			uioa->uioa_mbytes += cnt;
495			uioa->uio_loffset += cnt;
496			p = (caddr_t)p + cnt;
497			n -= cnt;
498
499			/* End of iovec? */
500			if (iov->iov_len == 0) {
501				/* Yup, next iovec */
502				break;
503			}
504
505			/* Next dst addr page? */
506			if (doff + cnt == PAGESIZE) {
507				/* Yup, next page_t */
508				uioa->uioa_lppp++;
509			}
510		}
511	}
512
513	return (ret);
514}
515
516/*
517 * Initialize a uioa_t for a given uio_t for the current user context,
518 * copy the common uio_t to the uioa_t, walk the shared iovec_t and
519 * lock down the user-land page(s) containing iovec_t data, then mapin
520 * user-land pages using segkpm.
521 */
522int
523uioainit(uio_t *uiop, uioa_t *uioap)
524{
525	caddr_t	addr;
526	page_t		**pages;
527	int		off;
528	int		len;
529	proc_t		*procp = ttoproc(curthread);
530	struct as	*as = procp->p_as;
531	iovec_t		*iov = uiop->uio_iov;
532	int32_t		iovcnt = uiop->uio_iovcnt;
533	uioa_page_t	*locked = uioap->uioa_locked;
534	dcopy_handle_t	channel;
535	int		error;
536
537	if (! (uioap->uioa_state & UIOA_ALLOC)) {
538		/* Can only init() a freshly allocated uioa_t */
539		return (EINVAL);
540	}
541
542	error = dcopy_alloc(DCOPY_NOSLEEP, &channel);
543	if (error == DCOPY_NORESOURCES) {
544		/* Turn off uioa */
545		uioasync.enabled = B_FALSE;
546		return (ENODEV);
547	}
548	if (error != DCOPY_SUCCESS) {
549		/* Alloc failed */
550		return (EIO);
551	}
552
553	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = channel;
554	uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
555
556	/* Indicate uioa_t (will be) initialized */
557	uioap->uioa_state = UIOA_INIT;
558
559	uioap->uioa_mbytes = 0;
560
561	/* uio_t/uioa_t uio_t common struct copy */
562	*((uio_t *)uioap) = *uiop;
563
564	/* initialize *uiop->uio_iov */
565	if (iovcnt > UIOA_IOV_MAX) {
566		/* Too big? */
567		return (E2BIG);
568	}
569	uioap->uio_iov = iov;
570	uioap->uio_iovcnt = iovcnt;
571
572	/* Mark the uioap as such */
573	uioap->uio_extflg |= UIO_ASYNC;
574
575	/*
576	 * For each iovec_t, lock-down the page(s) backing the iovec_t
577	 * and save the page_t list for phys addr use in uioamove().
578	 */
579	iov = uiop->uio_iov;
580	iovcnt = uiop->uio_iovcnt;
581	while (iovcnt > 0) {
582		addr = iov->iov_base;
583		off = (uintptr_t)addr & PAGEOFFSET;
584		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
585		len = iov->iov_len + off;
586
587		/* Lock down page(s) for the iov span */
588		if ((error = as_pagelock(as, &pages,
589		    iov->iov_base, iov->iov_len, S_WRITE)) != 0) {
590			/* Error */
591			goto cleanup;
592		}
593
594		if (pages == NULL) {
595			/*
596			 * Need page_t list, really only need
597			 * a pfn list so build one.
598			 */
599			pfn_t   *pfnp;
600			int	pcnt = len >> PAGESHIFT;
601
602			if (off)
603				pcnt++;
604			if ((pfnp = kmem_alloc(pcnt * sizeof (pfnp),
605			    KM_NOSLEEP)) == NULL) {
606				error = ENOMEM;
607				goto cleanup;
608			}
609			locked->uioa_ppp = (void **)pfnp;
610			locked->uioa_pfncnt = pcnt;
611			AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
612			while (pcnt-- > 0) {
613				*pfnp++ = hat_getpfnum(as->a_hat, addr);
614				addr += PAGESIZE;
615			}
616			AS_LOCK_EXIT(as, &as->a_lock);
617		} else {
618			/* Have a page_t list, save it */
619			locked->uioa_ppp = (void **)pages;
620			locked->uioa_pfncnt = 0;
621		}
622		/* Save for as_pageunlock() in uioafini() */
623		locked->uioa_base = iov->iov_base;
624		locked->uioa_len = iov->iov_len;
625		locked++;
626
627		/* Next iovec_t */
628		iov++;
629		iovcnt--;
630	}
631	/* Initialize curret pointer into uioa_locked[] and it's uioa_ppp */
632	uioap->uioa_lcur = uioap->uioa_locked;
633	uioap->uioa_lppp = uioap->uioa_lcur->uioa_ppp;
634	return (0);
635
636cleanup:
637	/* Unlock any previously locked page_t(s) */
638	while (locked > uioap->uioa_locked) {
639		locked--;
640		as_pageunlock(as, (page_t **)locked->uioa_ppp,
641		    locked->uioa_base, locked->uioa_len, S_WRITE);
642	}
643
644	/* Last indicate uioa_t still in alloc state */
645	uioap->uioa_state = UIOA_ALLOC;
646	uioap->uioa_mbytes = 0;
647
648	return (error);
649}
650
651/*
652 * Finish processing of a uioa_t by cleanup any pending "uioap" actions.
653 */
654int
655uioafini(uio_t *uiop, uioa_t *uioap)
656{
657	int32_t		iovcnt = uiop->uio_iovcnt;
658	uioa_page_t	*locked = uioap->uioa_locked;
659	struct as	*as = ttoproc(curthread)->p_as;
660	dcopy_handle_t	channel;
661	dcopy_cmd_t	cmd;
662	int		ret = 0;
663
664	ASSERT(uioap->uio_extflg & UIO_ASYNC);
665
666	if (!(uioap->uioa_state & (UIOA_ENABLED|UIOA_FINI))) {
667		/* Must be an active uioa_t */
668		return (EINVAL);
669	}
670
671	channel = uioap->uioa_hwst[UIO_DCOPY_CHANNEL];
672	cmd = uioap->uioa_hwst[UIO_DCOPY_CMD];
673
674	/* XXX - why do we get cmd == NULL sometimes? */
675	if (cmd != NULL) {
676		if (uioap->uioa_state & UIOA_POLL) {
677			/* Wait for last dcopy() to finish */
678			int64_t poll = 1;
679			int poll_flag = DCOPY_POLL_NOFLAGS;
680
681			do {
682				if (uioa_maxpoll == 0 ||
683				    (uioa_maxpoll > 0 &&
684				    poll >= uioa_maxpoll)) {
685					/* Always block or after maxpoll */
686					poll_flag = DCOPY_POLL_BLOCK;
687				} else {
688					/* No block, poll */
689					poll++;
690				}
691				ret = dcopy_cmd_poll(cmd, poll_flag);
692			} while (ret == DCOPY_PENDING);
693
694			if (ret == DCOPY_COMPLETED) {
695				/* Poll/block succeeded */
696				ret = 0;
697			} else {
698				/* Poll/block failed */
699				ret = EIO;
700			}
701		}
702		dcopy_cmd_free(&cmd);
703	}
704
705	dcopy_free(&channel);
706
707	/* Unlock all page(s) iovec_t by iovec_t */
708	while (iovcnt-- > 0) {
709		page_t **pages;
710
711		if (locked->uioa_pfncnt == 0) {
712			/* A as_pagelock() returned (page_t **) */
713			pages = (page_t **)locked->uioa_ppp;
714		} else {
715			/* Our pfn_t array */
716			pages = NULL;
717			kmem_free(locked->uioa_ppp, locked->uioa_pfncnt *
718			    sizeof (pfn_t *));
719		}
720		as_pageunlock(as, pages, locked->uioa_base, locked->uioa_len,
721		    S_WRITE);
722
723		locked++;
724	}
725	/* uioa_t->uio_t common struct copy */
726	*uiop = *((uio_t *)uioap);
727
728	/*
729	 * Last, reset uioa state to alloc.
730	 *
731	 * Note, we only initialize the state here, all other members
732	 * will be initialized in a subsequent uioainit().
733	 */
734	uioap->uioa_state = UIOA_ALLOC;
735	uioap->uioa_mbytes = 0;
736
737	uioap->uioa_hwst[UIO_DCOPY_CMD] = NULL;
738	uioap->uioa_hwst[UIO_DCOPY_CHANNEL] = NULL;
739
740	return (ret);
741}
742