/**************************************************************************
 *
 * Copyright (c) 2007, Kip Macy kmacy@freebsd.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 *
 ***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/cxgb/sys/uipc_mvec.c 168886 2007-04-20 05:06:02Z kmacy $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/ktr.h>
#include <sys/sf_buf.h>

#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include "opt_zero.h"

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>

#ifdef DEBUG
#define DPRINTF printf
#else
#define DPRINTF(...)
#endif

#ifdef INVARIANTS
#define M_SANITY m_sanity
#else
#define M_SANITY(a, b)
#endif

#define MAX_BUFS 36
#define MAX_HVEC 8

extern uint32_t collapse_free;
extern uint32_t mb_free_vec_free;
struct mbuf_ext {
	struct mbuf    *me_m;
	caddr_t         me_base;
	volatile u_int *me_refcnt;
	int             me_flags;
	uint32_t        me_offset;
};

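/*
 * Convert an M_IOVEC mbuf chain back into an ordinary mbuf chain, giving
 * each iovec entry its own mbuf.  Entries of type EXT_MBUF are restored
 * in place; every other entry has its cluster attached to a freshly
 * allocated mbuf.  Returns ENOMEM, after freeing the partial chain built
 * so far, if an allocation fails.
 */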
int
_m_explode(struct mbuf *m)
{
	int i, offset, type, first, len;
	uint8_t *cl;
	struct mbuf *m0, *head = NULL;
	struct mbuf_vec *mv;

#ifdef INVARIANTS
	len = m->m_len;
	m0 = m->m_next;
	while (m0) {
		KASSERT((m0->m_flags & M_PKTHDR) == 0,
		    ("pkthdr set on intermediate mbuf - pre"));
		len += m0->m_len;
		m0 = m0->m_next;
	}
	if (len != m->m_pkthdr.len)
		panic("at start len=%d pktlen=%d", len, m->m_pkthdr.len);
#endif
	mv = mtomv(m);
	first = mv->mv_first;
	for (i = mv->mv_count + first - 1; i > first; i--) {
		type = mbuf_vec_get_type(mv, i);
		cl = mv->mv_vec[i].mi_base;
		offset = mv->mv_vec[i].mi_offset;
		len = mv->mv_vec[i].mi_len;
		if (__predict_false(type == EXT_MBUF)) {
			m0 = (struct mbuf *)cl;
			KASSERT((m0->m_flags & M_EXT) == 0, ("M_EXT set on mbuf"));
			m0->m_len = len;
			m0->m_data = cl + offset;
			goto skip_cluster;
		} else if ((m0 = m_get(M_NOWAIT, MT_DATA)) == NULL) {
			/*
			 * Free what has been unwound so far so that
			 * nothing is leaked.
			 */
			m_freem(head);
			return (ENOMEM);
		}
		m0->m_flags = 0;

		m_cljset(m0, (uint8_t *)cl, type);
		m0->m_len = mv->mv_vec[i].mi_len;
		if (offset)
			m_adj(m0, offset);
	skip_cluster:
		m0->m_next = head;
		m->m_len -= m0->m_len;
		head = m0;
	}
	offset = mv->mv_vec[first].mi_offset;
	cl = mv->mv_vec[first].mi_base;
	type = mbuf_vec_get_type(mv, first);
	m->m_flags &= ~(M_IOVEC);
	m_cljset(m, cl, type);
	if (offset)
		m_adj(m, offset);
	m->m_next = head;
	head = m;
	M_SANITY(m, 0);

	return (0);
}

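/*
 * Flatten the chain rooted at m into the array vec (at most max entries).
 * Chains holding EXT_PACKET or sendfile (EXT_SFBUF) storage cannot be
 * collapsed and yield EINVAL; EFBIG means the chain did not fit in max
 * entries.
 */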
static __inline int
m_vectorize(struct mbuf *m, int max, struct mbuf **vec, int *count)
{
	int i, error = 0;

	for (i = 0; i < max; i++) {
		if (m == NULL)
			break;
#ifndef MBUF_PACKET_ZONE_DISABLE
		if ((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_PACKET))
			return (EINVAL);
#endif
#ifdef ZERO_COPY_SOCKETS
		if ((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_SFBUF))
			return (EINVAL);
#endif
		M_SANITY(m, 0);
		vec[i] = m;
		m = m->m_next;
	}
	if (m)
		error = EFBIG;

	*count = i;

	return (error);
}

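/*
 * Choose the mbufs that will head the collapsed chain, one for every
 * MAX_MBUF_IOV iovec entries needed.  Cluster-backed mbufs from the
 * original chain are reused by preference, with their external storage
 * recorded in ovec; any shortfall is made up with fresh allocations.
 * The packet header is migrated if the chain gets a new head.
 */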
static __inline int
m_findmbufs(struct mbuf **ivec, int maxbufs, struct mbuf_ext *ovec, int osize, int *ocount)
{
	int i, nhbufsneed, nhbufs;
	struct mbuf *m;

	nhbufsneed = min(((maxbufs - 1)/MAX_MBUF_IOV) + 1, osize);
	ovec[0].me_m = NULL;

	for (nhbufs = i = 0; i < maxbufs && nhbufs < nhbufsneed; i++) {
		if ((ivec[i]->m_flags & M_EXT) == 0)
			continue;
		m = ivec[i];
		ovec[nhbufs].me_m = m;
		ovec[nhbufs].me_base = m->m_ext.ext_buf;
		ovec[nhbufs].me_refcnt = m->m_ext.ref_cnt;
		ovec[nhbufs].me_offset = (m->m_data - m->m_ext.ext_buf);
		ovec[nhbufs].me_flags = m->m_ext.ext_type;
		nhbufs++;
	}
	if (nhbufs == 0) {
		if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
			goto m_getfail;
		ovec[nhbufs].me_m = m;
		nhbufs = 1;
	}
	while (nhbufs < nhbufsneed) {
		if ((m = m_get(M_NOWAIT, MT_DATA)) == NULL)
			goto m_getfail;
		ovec[nhbufs].me_m = m;
		nhbufs++;
	}
	/*
	 * Copy over packet header to new head of chain
	 */
	if (ovec[0].me_m != ivec[0]) {
		ovec[0].me_m->m_flags |= M_PKTHDR;
		memcpy(&ovec[0].me_m->m_pkthdr, &ivec[0]->m_pkthdr, sizeof(struct pkthdr));
		SLIST_INIT(&ivec[0]->m_pkthdr.tags);
	}
	*ocount = nhbufs;
	return (0);
m_getfail:
	for (i = 0; i < nhbufs; i++)
		if ((ovec[i].me_m->m_flags & M_EXT) == 0)
			uma_zfree(zone_mbuf, ovec[i].me_m);
	return (ENOMEM);
}

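/*
 * Fill one iovec entry from mbuf m.  If m is the next header mbuf
 * recorded in extvec, take the external-storage fields saved there (the
 * header mbuf itself is being converted to M_IOVEC, so its own m_ext
 * fields can no longer be trusted); otherwise use m_ext directly, or
 * describe a plain mbuf's own data area as EXT_MBUF storage.
 */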
static __inline void
m_setiovec(struct mbuf_iovec *mi, struct mbuf *m, struct mbuf_ext *extvec, int *me_index,
    int max_me_index)
{
	int idx = *me_index;

	mi->mi_len = m->m_len;
	if (idx < max_me_index && extvec[idx].me_m == m) {
		struct mbuf_ext *me = &extvec[idx];

		(*me_index)++;
		mi->mi_base = me->me_base;
		mi->mi_refcnt = me->me_refcnt;
		mi->mi_offset = me->me_offset;
		mi->mi_flags = me->me_flags;
	} else if (m->m_flags & M_EXT) {
		mi->mi_base = m->m_ext.ext_buf;
		mi->mi_refcnt = m->m_ext.ref_cnt;
		mi->mi_offset = (m->m_data - m->m_ext.ext_buf);
		mi->mi_flags = m->m_ext.ext_type;
	} else {
		KASSERT(m->m_len < 256, ("mbuf too large len=%d",
		    m->m_len));
		mi->mi_base = (uint8_t *)m;
		mi->mi_refcnt = NULL;
		mi->mi_offset = (m->m_data - (caddr_t)m);
		mi->mi_flags = EXT_MBUF;
	}
	DPRINTF("type=%d len=%d refcnt=%p cl=%p offset=0x%x\n",
	    mi->mi_flags, mi->mi_len, mi->mi_refcnt, mi->mi_base,
	    mi->mi_offset);
}

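/*
 * Collapse an ordinary mbuf chain of up to maxbufs mbufs into a much
 * shorter chain of M_IOVEC mbufs, each carrying up to MAX_MBUF_IOV iovec
 * entries.  On success *mnew points at the new head and the mbufs whose
 * clusters were absorbed have been freed.  A sketch of the intended use
 * from a transmit path (TX_MAX_SEGS and the softc fields here are
 * hypothetical):
 *
 *	struct mbuf *mnew;
 *
 *	if (_m_collapse(m, TX_MAX_SEGS, &mnew) == 0)
 *		m = mnew;
 *	error = bus_dmamap_load_mvec_sg(sc->tx_dmat, sc->tx_dmamap, m,
 *	    segs, &nsegs, 0);
 */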
int
_m_collapse(struct mbuf *m, int maxbufs, struct mbuf **mnew)
{
	struct mbuf *m0, *lmvec[MAX_BUFS];
	struct mbuf **mnext;
	struct mbuf **vec = lmvec;
	struct mbuf *mhead = NULL;
	struct mbuf_vec *mv;
	int err, i, j, max, len, nhbufs;
	struct mbuf_ext dvec[MAX_HVEC];
	int hidx, dvecidx;

	M_SANITY(m, 0);
	if (maxbufs > MAX_BUFS) {
		if ((vec = malloc(maxbufs * sizeof(struct mbuf *),
		    M_DEVBUF, M_NOWAIT)) == NULL)
			return (ENOMEM);
	}

	if ((err = m_vectorize(m, maxbufs, vec, &max)) != 0)
		goto out;
	if ((err = m_findmbufs(vec, max, dvec, MAX_HVEC, &nhbufs)) != 0)
		goto out;

	KASSERT(max > 0, ("invalid mbuf count"));
	KASSERT(nhbufs > 0, ("invalid header mbuf count"));

	mhead = m0 = dvec[0].me_m;

	DPRINTF("nbufs=%d nhbufs=%d\n", max, nhbufs);
	for (hidx = dvecidx = i = 0, mnext = NULL; i < max; hidx++) {
		m0 = dvec[hidx].me_m;
		m0->m_flags &= ~M_EXT;
		m0->m_flags |= M_IOVEC;

		if (mnext)
			*mnext = m0;

		mv = mtomv(m0);
		len = mv->mv_first = 0;
		for (j = 0; j < MAX_MBUF_IOV && i < max; j++, i++) {
			struct mbuf_iovec *mi = &mv->mv_vec[j];

			m_setiovec(mi, vec[i], dvec, &dvecidx, nhbufs);
			len += mi->mi_len;
		}
		m0->m_data = mv->mv_vec[0].mi_base + mv->mv_vec[0].mi_offset;
		mv->mv_count = j;
		m0->m_len = len;
		mnext = &m0->m_next;
		DPRINTF("count=%d len=%d\n", j, len);
	}

	/*
	 * Terminate the chain.
	 */
	m0->m_next = NULL;

	/*
	 * Free all mbufs not reused by the iovec chain.  Their cluster
	 * references now live in iovec entries, so M_EXT is cleared
	 * before each mbuf is returned to the zone.
	 */
	for (i = 0; i < max; i++)
		if (vec[i]->m_flags & M_EXT) {
			vec[i]->m_flags &= ~M_EXT;
			collapse_free++;
			uma_zfree(zone_mbuf, vec[i]);
		}

	*mnew = mhead;
out:
	if (vec != lmvec)
		free(vec, M_DEVBUF);
	return (err);
}

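/*
 * Free an M_IOVEC mbuf: drop a reference on (and, when the count reaches
 * zero, free) each iovec entry's storage, then release the mbuf itself
 * back to the mbuf zone.
 */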
void
mb_free_vec(struct mbuf *m)
{
	struct mbuf_vec *mv;
	int i;

	KASSERT((m->m_flags & (M_EXT|M_IOVEC)) == M_IOVEC,
	    ("%s: M_EXT set", __func__));

	mv = mtomv(m);
	KASSERT(mv->mv_count <= MAX_MBUF_IOV,
	    ("%s: mi_count too large %d", __func__, mv->mv_count));

	DPRINTF("count=%d len=%d\n", mv->mv_count, m->m_len);
	for (i = mv->mv_first; i < mv->mv_count; i++) {
		uma_zone_t zone = NULL;
		volatile int *refcnt = mv->mv_vec[i].mi_refcnt;
		int type = mbuf_vec_get_type(mv, i);
		void *cl = mv->mv_vec[i].mi_base;

		if (refcnt && *refcnt != 1 && atomic_fetchadd_int(refcnt, -1) != 1)
			continue;

		DPRINTF("freeing idx=%d refcnt=%p type=%d cl=%p\n", i, refcnt, type, cl);
		switch (type) {
		case EXT_MBUF:
			mb_free_vec_free++;
			/* FALLTHROUGH */
		case EXT_CLUSTER:
		case EXT_JUMBOP:
		case EXT_JUMBO9:
		case EXT_JUMBO16:
			zone = m_getzonefromtype(type);
			uma_zfree(zone, cl);
			continue;
		case EXT_SFBUF:
			*refcnt = 0;
			uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *,
			    refcnt));
#ifdef __i386__
			sf_buf_mext(cl, mv->mv_vec[i].mi_args);
#else
			/*
			 * Every architecture other than i386 uses a vm_page
			 * for an sf_buf (well ... sparc64 does but shouldn't).
			 */
			sf_buf_mext(cl, PHYS_TO_VM_PAGE(vtophys(cl)));
#endif
			continue;
		default:
			KASSERT(m->m_ext.ext_type == 0,
			    ("%s: unknown ext_type", __func__));
			break;
		}
	}
	/*
	 * Free this mbuf back to the mbuf zone with all iovec
	 * information purged.
	 */
	mb_free_vec_free++;
	uma_zfree(zone_mbuf, m);
}

#if (!defined(__sparc64__) && !defined(__sun4v__))
#include <sys/sysctl.h>

#define BUS_DMA_COULD_BOUNCE	BUS_DMA_BUS3
#define BUS_DMA_MIN_ALLOC_COMP	BUS_DMA_BUS4

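/*
 * The structures below appear to mirror the private definitions in the
 * platform busdma_machdep.c so that _bus_dmamap_load_buffer() can be
 * reimplemented here for iovec loading; they must be kept in sync with
 * the machine-dependent originals.
 */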
struct bounce_zone {
	STAILQ_ENTRY(bounce_zone) links;
	STAILQ_HEAD(bp_list, bounce_page) bounce_page_list;
	int		total_bpages;
	int		free_bpages;
	int		reserved_bpages;
	int		active_bpages;
	int		total_bounced;
	int		total_deferred;
	bus_size_t	alignment;
	bus_size_t	boundary;
	bus_addr_t	lowaddr;
	char		zoneid[8];
	char		lowaddrid[20];
	struct sysctl_ctx_list sysctl_tree;
	struct sysctl_oid *sysctl_tree_top;
};

struct bus_dma_tag {
	bus_dma_tag_t	  parent;
	bus_size_t	  alignment;
	bus_size_t	  boundary;
	bus_addr_t	  lowaddr;
	bus_addr_t	  highaddr;
	bus_dma_filter_t *filter;
	void		 *filterarg;
	bus_size_t	  maxsize;
	u_int		  nsegments;
	bus_size_t	  maxsegsz;
	int		  flags;
	int		  ref_count;
	int		  map_count;
	bus_dma_lock_t	 *lockfunc;
	void		 *lockfuncarg;
	bus_dma_segment_t *segments;
	struct bounce_zone *bounce_zone;
};

struct bus_dmamap {
	struct bp_list	       bpages;
	int		       pagesneeded;
	int		       pagesreserved;
	bus_dma_tag_t	       dmat;
	void		      *buf;		/* unmapped buffer pointer */
	bus_size_t	       buflen;		/* unmapped buffer length */
	bus_dmamap_callback_t *callback;
	void		      *callback_arg;
	STAILQ_ENTRY(bus_dmamap) links;
};

static struct bus_dmamap nobounce_dmamap;

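/*
 * Walk the tag hierarchy and report whether paddr would need bouncing:
 * out of the tag's address range or misaligned, and not excluded by a
 * filter function.
 */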
static __inline int
run_filter(bus_dma_tag_t dmat, bus_addr_t paddr)
{
	int retval;

	retval = 0;

	do {
		if (((paddr > dmat->lowaddr && paddr <= dmat->highaddr)
		 || ((paddr & (dmat->alignment - 1)) != 0))
		 && (dmat->filter == NULL
		  || (*dmat->filter)(dmat->filterarg, paddr) != 0))
			retval = 1;

		dmat = dmat->parent;
	} while (retval == 0 && dmat != NULL);
	return (retval);
}

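/*
 * Utility function to load a linear buffer, seemingly a trimmed copy of
 * the machine-dependent original.  lastaddrp carries the end address of
 * the previous segment between calls, *segp is the current segment
 * index, and first marks the initial call for a mapping.  Unlike the
 * original, this copy panics rather than bouncing.
 */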
static __inline int
_bus_dmamap_load_buffer(bus_dma_tag_t dmat,
			bus_dmamap_t map,
			void *buf, bus_size_t buflen,
			pmap_t pmap,
			int flags,
			bus_addr_t *lastaddrp,
			bus_dma_segment_t *segs,
			int *segp,
			int first)
{
	bus_size_t sgsize;
	bus_addr_t curaddr, lastaddr, baddr, bmask;
	vm_offset_t vaddr;
	int needbounce = 0;
	int seg;

	if (map == NULL)
		map = &nobounce_dmamap;

	/* Bounce pages are not supported here. */
	if (map->pagesneeded != 0)
		panic("don't support bounce pages");

	vaddr = (vm_offset_t)buf;
	lastaddr = *lastaddrp;
	bmask = ~(dmat->boundary - 1);

	for (seg = *segp; buflen > 0; ) {
		/*
		 * Get the physical address for this segment.
		 */
		if (pmap)
			curaddr = pmap_extract(pmap, vaddr);
		else
			curaddr = pmap_kextract(vaddr);

		/*
		 * Compute the segment size, and adjust counts.
		 */
		sgsize = PAGE_SIZE - ((u_long)curaddr & PAGE_MASK);
		if (buflen < sgsize)
			sgsize = buflen;

		/*
		 * Make sure we don't cross any boundaries.
		 */
		if (dmat->boundary > 0) {
			baddr = (curaddr + dmat->boundary) & bmask;
			if (sgsize > (baddr - curaddr))
				sgsize = (baddr - curaddr);
		}

		if (map->pagesneeded != 0 && run_filter(dmat, curaddr))
			panic("no bounce page support");

		/*
		 * Insert chunk into a segment, coalescing with
		 * previous segment if possible.
		 */
		if (first) {
			segs[seg].ds_addr = curaddr;
			segs[seg].ds_len = sgsize;
			first = 0;
		} else {
			if (needbounce == 0 && curaddr == lastaddr &&
			    (segs[seg].ds_len + sgsize) <= dmat->maxsegsz &&
			    (dmat->boundary == 0 ||
			     (segs[seg].ds_addr & bmask) == (curaddr & bmask)))
				segs[seg].ds_len += sgsize;
			else {
				if (++seg >= dmat->nsegments)
					break;
				segs[seg].ds_addr = curaddr;
				segs[seg].ds_len = sgsize;
			}
		}

		lastaddr = curaddr + sgsize;
		vaddr += sgsize;
		buflen -= sgsize;
	}

	*segp = seg;
	*lastaddrp = lastaddr;

	/*
	 * Did we fit?
	 */
	return (buflen != 0 ? EFBIG : 0); /* XXX better return value here? */
}

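/*
 * Like bus_dmamap_load_mbuf_sg(), but walks the iovec entries of an
 * M_IOVEC chain directly; ordinary chains are handed to the stock
 * loader.  Operates in BUS_DMA_NOWAIT mode only.
 */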
int
bus_dmamap_load_mvec_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
			bus_dma_segment_t *segs, int *nsegs, int flags)
{
	int error, i;

	M_ASSERTPKTHDR(m0);

	if ((m0->m_flags & M_IOVEC) == 0)
		return (bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs, flags));

	flags |= BUS_DMA_NOWAIT;
	*nsegs = 0;
	error = 0;
	if (m0->m_pkthdr.len <= dmat->maxsize) {
		int first = 1;
		bus_addr_t lastaddr = 0;
		struct mbuf *m;

		for (m = m0; m != NULL && error == 0; m = m->m_next) {
			struct mbuf_vec *mv;
			int count, firstcl;

			if (m->m_len <= 0)
				continue;

			mv = mtomv(m);
			count = mv->mv_count;
			firstcl = mv->mv_first;
			KASSERT(count <= MAX_MBUF_IOV, ("count=%d too large", count));
			for (i = firstcl; i < count && error == 0; i++) {
				void *data = mv->mv_vec[i].mi_base + mv->mv_vec[i].mi_offset;
				int len = mv->mv_vec[i].mi_len;

				if (len == 0)
					continue;
				DPRINTF("mapping data=%p len=%d\n", data, len);
				error = _bus_dmamap_load_buffer(dmat, NULL,
				    data, len, NULL, flags, &lastaddr,
				    segs, nsegs, first);
				DPRINTF("%d: addr=0x%jx len=%ju\n", i,
				    (uintmax_t)segs[i].ds_addr, (uintmax_t)segs[i].ds_len);
				first = 0;
			}
		}
	} else {
		error = EINVAL;
	}

	/* *nsegs holds the index of the last segment; convert to a count. */
	(*nsegs)++;

	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
	    __func__, dmat, dmat->flags, error, *nsegs);
	return (error);
}
#endif /* !__sparc64__ && !__sun4v__ */