subr_sglist.c revision 193260
1/*-
2 * Copyright (c) 2008 Yahoo!, Inc.
3 * All rights reserved.
4 * Written by: John Baldwin <jhb@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the author nor the names of any co-contributors
15 *    may be used to endorse or promote products derived from this software
16 *    without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD: head/sys/kern/subr_sglist.c 193260 2009-06-01 20:35:39Z jhb $");
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/malloc.h>
37#include <sys/mbuf.h>
38#include <sys/proc.h>
39#include <sys/sglist.h>
40#include <sys/uio.h>
41
42#include <vm/vm.h>
43#include <vm/pmap.h>
44#include <vm/vm_map.h>
45
46#include <sys/ktr.h>
47
48static MALLOC_DEFINE(M_SGLIST, "sglist", "scatter/gather lists");
49
50/*
51 * Append a single (paddr, len) to a sglist.  sg is the list and ss is
52 * the current segment in the list.  If we run out of segments then
53 * EFBIG will be returned.
54 */
55static __inline int
56_sglist_append_range(struct sglist *sg, struct sglist_seg **ssp,
57    vm_paddr_t paddr, size_t len)
58{
59	struct sglist_seg *ss;
60
61	ss = *ssp;
62	if (ss->ss_paddr + ss->ss_len == paddr)
63		ss->ss_len += len;
64	else {
65		if (sg->sg_nseg == sg->sg_maxseg) {
66			sg->sg_nseg = 0;
67			return (EFBIG);
68		}
69		ss++;
70		ss->ss_paddr = paddr;
71		ss->ss_len = len;
72		sg->sg_nseg++;
73		*ssp = ss;
74	}
75	return (0);
76}
77
78/*
79 * Worker routine to append a virtual address range (either kernel or
80 * user) to a scatter/gather list.
81 */
82static __inline int
83_sglist_append_buf(struct sglist *sg, void *buf, size_t len, pmap_t pmap,
84    size_t *donep)
85{
86	struct sglist_seg *ss;
87	vm_offset_t vaddr, offset;
88	vm_paddr_t paddr;
89	size_t seglen;
90	int error;
91
92	if (donep)
93		*donep = 0;
94	if (len == 0)
95		return (0);
96
97	/* Do the first page.  It may have an offset. */
98	vaddr = (vm_offset_t)buf;
99	offset = vaddr & PAGE_MASK;
100	if (pmap != NULL)
101		paddr = pmap_extract(pmap, vaddr);
102	else
103		paddr = pmap_kextract(vaddr);
104	seglen = MIN(len, PAGE_SIZE - offset);
105	if (sg->sg_nseg == 0) {
106		ss = sg->sg_segs;
107		ss->ss_paddr = paddr;
108		ss->ss_len = seglen;
109		sg->sg_nseg = 1;
110		error = 0;
111	} else {
112		ss = &sg->sg_segs[sg->sg_nseg - 1];
113		error = _sglist_append_range(sg, &ss, paddr, seglen);
114	}
115
116	while (error == 0 && len > seglen) {
117		vaddr += seglen;
118		len -= seglen;
119		if (donep)
120			*donep += seglen;
121		seglen = MIN(len, PAGE_SIZE);
122		if (pmap != NULL)
123			paddr = pmap_extract(pmap, vaddr);
124		else
125			paddr = pmap_kextract(vaddr);
126		error = _sglist_append_range(sg, &ss, paddr, seglen);
127	}
128
129	return (error);
130}
131
132/*
133 * Determine the number of scatter/gather list elements needed to
134 * describe a kernel virtual address range.
135 */
136int
137sglist_count(void *buf, size_t len)
138{
139	vm_offset_t vaddr, vendaddr;
140	vm_paddr_t lastaddr, paddr;
141	int nsegs;
142
143	if (len == 0)
144		return (0);
145
146	vaddr = trunc_page((vm_offset_t)buf);
147	vendaddr = (vm_offset_t)buf + len;
148	nsegs = 1;
149	lastaddr = pmap_kextract(vaddr);
150	vaddr += PAGE_SIZE;
151	while (vaddr < vendaddr) {
152		paddr = pmap_kextract(vaddr);
153		if (lastaddr + PAGE_SIZE != paddr)
154			nsegs++;
155		lastaddr = paddr;
156		vaddr += PAGE_SIZE;
157	}
158	return (nsegs);
159}
160
161/*
162 * Allocate a scatter/gather list along with 'nsegs' segments.  The
163 * 'mflags' parameters are the same as passed to malloc(9).  The caller
164 * should use sglist_free() to free this list.
165 */
166struct sglist *
167sglist_alloc(int nsegs, int mflags)
168{
169	struct sglist *sg;
170
171	sg = malloc(sizeof(struct sglist) + nsegs * sizeof(struct sglist_seg),
172	    M_SGLIST, mflags);
173	if (sg == NULL)
174		return (NULL);
175	sglist_init(sg, nsegs, (struct sglist_seg *)(sg + 1));
176	return (sg);
177}
178
179/*
180 * Free a scatter/gather list allocated via sglist_allc().
181 */
182void
183sglist_free(struct sglist *sg)
184{
185
186	if (refcount_release(&sg->sg_refs))
187		free(sg, M_SGLIST);
188}
189
190/*
191 * Append the segments to describe a single kernel virtual address
192 * range to a scatter/gather list.  If there are insufficient
193 * segments, then this fails with EFBIG.
194 */
195int
196sglist_append(struct sglist *sg, void *buf, size_t len)
197{
198
199	if (sg->sg_maxseg == 0)
200		return (EINVAL);
201	return (_sglist_append_buf(sg, buf, len, NULL, NULL));
202}
203
204/*
205 * Append a single physical address range to a scatter/gather list.
206 * If there are insufficient segments, then this fails with EFBIG.
207 */
208int
209sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len)
210{
211	struct sglist_seg *ss;
212
213	if (sg->sg_maxseg == 0)
214		return (EINVAL);
215	if (len == 0)
216		return (0);
217
218	if (sg->sg_nseg == 0) {
219		sg->sg_segs[0].ss_paddr = paddr;
220		sg->sg_segs[0].ss_len = len;
221		sg->sg_nseg = 1;
222		return (0);
223	}
224	ss = &sg->sg_segs[sg->sg_nseg - 1];
225	return (_sglist_append_range(sg, &ss, paddr, len));
226}
227
228/*
229 * Append the segments that describe a single mbuf chain to a
230 * scatter/gather list.  If there are insufficient segments, then this
231 * fails with EFBIG.
232 */
233int
234sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
235{
236	struct mbuf *m;
237	int error;
238
239	if (sg->sg_maxseg == 0)
240		return (EINVAL);
241
242	error = 0;
243	for (m = m0; m != NULL; m = m->m_next) {
244		if (m->m_len > 0) {
245			error = sglist_append(sg, m->m_data, m->m_len);
246			if (error)
247				return (error);
248		}
249	}
250	return (0);
251}
252
253/*
254 * Append the segments that describe a single user address range to a
255 * scatter/gather list.  If there are insufficient segments, then this
256 * fails with EFBIG.
257 */
258int
259sglist_append_user(struct sglist *sg, void *buf, size_t len, struct thread *td)
260{
261
262	if (sg->sg_maxseg == 0)
263		return (EINVAL);
264	return (_sglist_append_buf(sg, buf, len,
265	    vmspace_pmap(td->td_proc->p_vmspace), NULL));
266}
267
268/*
269 * Append the segments that describe a single uio to a scatter/gather
270 * list.  If there are insufficient segments, then this fails with
271 * EFBIG.
272 */
273int
274sglist_append_uio(struct sglist *sg, struct uio *uio)
275{
276	struct iovec *iov;
277	size_t resid, minlen;
278	pmap_t pmap;
279	int error, i;
280
281	if (sg->sg_maxseg == 0)
282		return (EINVAL);
283
284	resid = uio->uio_resid;
285	iov = uio->uio_iov;
286
287	if (uio->uio_segflg == UIO_USERSPACE) {
288		KASSERT(uio->uio_td != NULL,
289		    ("sglist_append_uio: USERSPACE but no thread"));
290		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
291	} else
292		pmap = NULL;
293
294	error = 0;
295	for (i = 0; i < uio->uio_iovcnt && resid != 0; i++) {
296		/*
297		 * Now at the first iovec to load.  Load each iovec
298		 * until we have exhausted the residual count.
299		 */
300		minlen = MIN(resid, iov[i].iov_len);
301		if (minlen > 0) {
302			error = _sglist_append_buf(sg, iov[i].iov_base, minlen,
303			    pmap, NULL);
304			if (error)
305				return (error);
306			resid -= minlen;
307		}
308	}
309	return (0);
310}
311
312/*
313 * Append the segments that describe at most 'resid' bytes from a
314 * single uio to a scatter/gather list.  If there are insufficient
315 * segments, then only the amount that fits is appended.
316 */
317int
318sglist_consume_uio(struct sglist *sg, struct uio *uio, int resid)
319{
320	struct iovec *iov;
321	size_t done;
322	pmap_t pmap;
323	int error, len;
324
325	if (sg->sg_maxseg == 0)
326		return (EINVAL);
327
328	if (uio->uio_segflg == UIO_USERSPACE) {
329		KASSERT(uio->uio_td != NULL,
330		    ("sglist_consume_uio: USERSPACE but no thread"));
331		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
332	} else
333		pmap = NULL;
334
335	error = 0;
336	while (resid > 0 && uio->uio_resid) {
337		iov = uio->uio_iov;
338		len = iov->iov_len;
339		if (len == 0) {
340			uio->uio_iov++;
341			uio->uio_iovcnt--;
342			continue;
343		}
344		if (len > resid)
345			len = resid;
346
347		/*
348		 * Try to append this iovec.  If we run out of room,
349		 * then break out of the loop.
350		 */
351		error = _sglist_append_buf(sg, iov->iov_base, len, pmap, &done);
352		iov->iov_base = (char *)iov->iov_base + done;
353		iov->iov_len -= done;
354		uio->uio_resid -= done;
355		uio->uio_offset += done;
356		resid -= done;
357		if (error)
358			break;
359	}
360	return (0);
361}
362
363/*
364 * Allocate and populate a scatter/gather list to describe a single
365 * kernel virtual address range.
366 */
367struct sglist *
368sglist_build(void *buf, size_t len, int mflags)
369{
370	struct sglist *sg;
371	int nsegs;
372
373	if (len == 0)
374		return (NULL);
375
376	nsegs = sglist_count(buf, len);
377	sg = sglist_alloc(nsegs, mflags);
378	if (sg == NULL)
379		return (NULL);
380	if (sglist_append(sg, buf, len) != 0) {
381		sglist_free(sg);
382		return (NULL);
383	}
384	return (sg);
385}
386
387/*
388 * Clone a new copy of a scatter/gather list.
389 */
390struct sglist *
391sglist_clone(struct sglist *sg, int mflags)
392{
393	struct sglist *new;
394
395	if (sg == NULL)
396		return (NULL);
397	new = sglist_alloc(sg->sg_maxseg, mflags);
398	if (new == NULL)
399		return (NULL);
400	bcopy(sg->sg_segs, new->sg_segs, sizeof(struct sglist_seg) *
401	    sg->sg_nseg);
402	return (new);
403}
404
405/*
406 * Calculate the total length of the segments described in a
407 * scatter/gather list.
408 */
409size_t
410sglist_length(struct sglist *sg)
411{
412	size_t space;
413	int i;
414
415	space = 0;
416	for (i = 0; i < sg->sg_nseg; i++)
417		space += sg->sg_segs[i].ss_len;
418	return (space);
419}
420
421/*
422 * Split a scatter/gather list into two lists.  The scatter/gather
423 * entries for the first 'length' bytes of the 'original' list are
424 * stored in the '*head' list and are removed from 'original'.
425 *
426 * If '*head' is NULL, then a new list will be allocated using
427 * 'mflags'.  If M_NOWAIT is specified and the allocation fails,
428 * ENOMEM will be returned.
429 *
430 * If '*head' is not NULL, it should point to an empty sglist.  If it
431 * does not have enough room for the remaining space, then EFBIG will
432 * be returned.  If '*head' is not empty, then EINVAL will be
433 * returned.
434 *
435 * If 'original' is shared (refcount > 1), then EDOOFUS will be
436 * returned.
437 */
438int
439sglist_split(struct sglist *original, struct sglist **head, size_t length,
440    int mflags)
441{
442	struct sglist *sg;
443	size_t space, split;
444	int count, i;
445
446	if (original->sg_refs > 1)
447		return (EDOOFUS);
448
449	/* Figure out how big of a sglist '*head' has to hold. */
450	count = 0;
451	space = 0;
452	split = 0;
453	for (i = 0; i < original->sg_nseg; i++) {
454		space += original->sg_segs[i].ss_len;
455		count++;
456		if (space >= length) {
457			/*
458			 * If 'length' falls in the middle of a
459			 * scatter/gather list entry, then 'split'
460			 * holds how much of that entry will remain in
461			 * 'original'.
462			 */
463			split = space - length;
464			break;
465		}
466	}
467
468	/* Nothing to do, so leave head empty. */
469	if (count == 0)
470		return (0);
471
472	if (*head == NULL) {
473		sg = sglist_alloc(count, mflags);
474		if (sg == NULL)
475			return (ENOMEM);
476		*head = sg;
477	} else {
478		sg = *head;
479		if (sg->sg_maxseg < count)
480			return (EFBIG);
481		if (sg->sg_nseg != 0)
482			return (EINVAL);
483	}
484
485	/* Copy 'count' entries to 'sg' from 'original'. */
486	bcopy(original->sg_segs, sg->sg_segs, count *
487	    sizeof(struct sglist_seg));
488	sg->sg_nseg = count;
489
490	/*
491	 * If we had to split a list entry, fixup the last entry in
492	 * 'sg' and the new first entry in 'original'.  We also
493	 * decrement 'count' by 1 since we will only be removing
494	 * 'count - 1' segments from 'original' now.
495	 */
496	if (split != 0) {
497		count--;
498		sg->sg_segs[count].ss_len -= split;
499		original->sg_segs[count].ss_paddr =
500		    sg->sg_segs[count].ss_paddr + split;
501		original->sg_segs[count].ss_len = split;
502	}
503
504	/* Trim 'count' entries from the front of 'original'. */
505	original->sg_nseg -= count;
506	bcopy(original->sg_segs + count, original->sg_segs, count *
507	    sizeof(struct sglist_seg));
508	return (0);
509}
510
511/*
512 * Append the scatter/gather list elements in 'second' to the
513 * scatter/gather list 'first'.  If there is not enough space in
514 * 'first', EFBIG is returned.
515 */
516int
517sglist_join(struct sglist *first, struct sglist *second)
518{
519	struct sglist_seg *flast, *sfirst;
520	int append;
521
522	/* If 'second' is empty, there is nothing to do. */
523	if (second->sg_nseg == 0)
524		return (0);
525
526	/*
527	 * If the first entry in 'second' can be appended to the last entry
528	 * in 'first' then set append to '1'.
529	 */
530	append = 0;
531	flast = &first->sg_segs[first->sg_nseg - 1];
532	sfirst = &second->sg_segs[0];
533	if (first->sg_nseg != 0 &&
534	    flast->ss_paddr + flast->ss_len == sfirst->ss_paddr)
535		append = 1;
536
537	/* Make sure 'first' has enough room. */
538	if (first->sg_nseg + second->sg_nseg - append > first->sg_maxseg)
539		return (EFBIG);
540
541	/* Merge last in 'first' and first in 'second' if needed. */
542	if (append)
543		flast->ss_len += sfirst->ss_len;
544
545	/* Append new segments from 'second' to 'first'. */
546	bcopy(first->sg_segs + first->sg_nseg, second->sg_segs + append,
547	    (second->sg_nseg - append) * sizeof(struct sglist_seg));
548	first->sg_nseg += second->sg_nseg - append;
549	sglist_reset(second);
550	return (0);
551}
552
553/*
554 * Generate a new scatter/gather list from a range of an existing
555 * scatter/gather list.  The 'offset' and 'length' parameters specify
556 * the logical range of the 'original' list to extract.  If that range
557 * is not a subset of the length of 'original', then EINVAL is
558 * returned.  The new scatter/gather list is stored in '*slice'.
559 *
560 * If '*slice' is NULL, then a new list will be allocated using
561 * 'mflags'.  If M_NOWAIT is specified and the allocation fails,
562 * ENOMEM will be returned.
563 *
564 * If '*slice' is not NULL, it should point to an empty sglist.  If it
565 * does not have enough room for the remaining space, then EFBIG will
566 * be returned.  If '*slice' is not empty, then EINVAL will be
567 * returned.
568 */
569int
570sglist_slice(struct sglist *original, struct sglist **slice, size_t offset,
571    size_t length, int mflags)
572{
573	struct sglist *sg;
574	size_t space, end, foffs, loffs;
575	int count, i, fseg;
576
577	/* Nothing to do. */
578	if (length == 0)
579		return (0);
580
581	/* Figure out how many segments '*slice' needs to have. */
582	end = offset + length;
583	space = 0;
584	count = 0;
585	fseg = 0;
586	foffs = loffs = 0;
587	for (i = 0; i < original->sg_nseg; i++) {
588		space += original->sg_segs[i].ss_len;
589		if (space > offset) {
590			/*
591			 * When we hit the first segment, store its index
592			 * in 'fseg' and the offset into the first segment
593			 * of 'offset' in 'foffs'.
594			 */
595			if (count == 0) {
596				fseg = i;
597				foffs = offset - (space -
598				    original->sg_segs[i].ss_len);
599				CTR1(KTR_DEV, "sglist_slice: foffs = %08lx",
600				    foffs);
601			}
602			count++;
603
604			/*
605			 * When we hit the last segment, break out of
606			 * the loop.  Store the amount of extra space
607			 * at the end of this segment in 'loffs'.
608			 */
609			if (space >= end) {
610				loffs = space - end;
611				CTR1(KTR_DEV, "sglist_slice: loffs = %08lx",
612				    loffs);
613				break;
614			}
615		}
616	}
617
618	/* If we never hit 'end', then 'length' ran off the end, so fail. */
619	if (space < end)
620		return (EINVAL);
621
622	if (*slice == NULL) {
623		sg = sglist_alloc(count, mflags);
624		if (sg == NULL)
625			return (ENOMEM);
626		*slice = sg;
627	} else {
628		sg = *slice;
629		if (sg->sg_maxseg < count)
630			return (EFBIG);
631		if (sg->sg_nseg != 0)
632			return (EINVAL);
633	}
634
635	/*
636	 * Copy over 'count' segments from 'original' starting at
637	 * 'fseg' to 'sg'.
638	 */
639	bcopy(original->sg_segs + fseg, sg->sg_segs,
640	    count * sizeof(struct sglist_seg));
641	sg->sg_nseg = count;
642
643	/* Fixup first and last segments if needed. */
644	if (foffs != 0) {
645		sg->sg_segs[0].ss_paddr += foffs;
646		sg->sg_segs[0].ss_len -= foffs;
647		CTR2(KTR_DEV, "sglist_slice seg[0]: %08lx:%08lx",
648		    (long)sg->sg_segs[0].ss_paddr, sg->sg_segs[0].ss_len);
649	}
650	if (loffs != 0) {
651		sg->sg_segs[count - 1].ss_len -= loffs;
652		CTR2(KTR_DEV, "sglist_slice seg[%d]: len %08x", count - 1,
653		    sg->sg_segs[count - 1].ss_len);
654	}
655	return (0);
656}
657