1/*-
2 * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 *  1. Redistributions of source code must retain the above copyright notice,
9 *     this list of conditions and the following disclaimer.
10 *
11 *  2. Neither the name of Matthew Macy nor the names of its
12 *     contributors may be used to endorse or promote products derived from
13 *     this software without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 * POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29#include <stdlib.h>
30__FBSDID("$FreeBSD$");
31
32#ifndef __HAIKU__
33#include "opt_inet.h"
34#include "opt_inet6.h"
35#include "opt_acpi.h"
36#include "opt_sched.h"
37#endif
38
39#include <sys/param.h>
40#include <sys/types.h>
41#include <sys/bus.h>
42#include <sys/eventhandler.h>
43#ifndef __HAIKU__
44#include <sys/jail.h>
45#endif
46#include <sys/kernel.h>
47#include <sys/lock.h>
48#include <sys/mutex.h>
49#include <sys/sx.h>
50#include <sys/module.h>
51#include <sys/kobj.h>
52#include <sys/rman.h>
53#include <sys/sbuf.h>
54#include <sys/smp.h>
55#include <sys/socket.h>
56#include <sys/sockio.h>
57#include <sys/sysctl.h>
58#include <sys/syslog.h>
59#include <sys/taskqueue.h>
60#include <sys/limits.h>
61
62#include <net/if.h>
63#include <net/if_var.h>
64#include <net/if_types.h>
65#include <net/if_media.h>
66#include <net/bpf.h>
67#include <net/ethernet.h>
68#include <net/if_vlan_var.h>
69#include <net/mp_ring.h>
70#include <net/vnet.h>
71#include <net/debugnet.h>
72
73#include <netinet/in.h>
74#ifndef __HAIKU__
75#include <netinet/in_pcb.h>
76#include <netinet/tcp_lro.h>
77#include <netinet/in_systm.h>
78#endif
79#include <netinet/if_ether.h>
80#include <netinet/ip.h>
81#include <netinet/ip6.h>
82#include <netinet/tcp.h>
83#include <netinet/ip_var.h>
84#ifndef __HAIKU__
85#include <netinet6/ip6_var.h>
86#endif
87
88#include <machine/bus.h>
89#ifndef __HAIKU__
90#include <machine/in_cksum.h>
91#endif
92
93#include <vm/vm.h>
94#include <vm/pmap.h>
95
96#include <dev/led/led.h>
97#include <dev/pci/pcireg.h>
98#include <dev/pci/pcivar.h>
99#ifndef __HAIKU__
100#include <dev/pci/pci_private.h>
101#endif
102
103#include <net/iflib.h>
104#include <net/iflib_private.h>
105
106#include <ifdi_if.h>
107#include <device_if.h>
108
109#ifdef PCI_IOV
110#include <dev/pci/pci_iov.h>
111#endif
112
113#include <sys/bitstring.h>
114
115/*
116 * enable accounting of every mbuf as it comes in to and goes out of
117 * iflib's software descriptor references
118 */
119#define MEMORY_LOGGING 0
120/*
121 * Enable mbuf vectors for compressing long mbuf chains
122 */
123
124/*
125 * NB:
126 * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
127 *   we prefetch needs to be determined by the time spent in m_free vis a vis
128 *   the cost of a prefetch. This will of course vary based on the workload:
129 *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
130 *        is quite expensive, thus suggesting very little prefetch.
131 *      - small packet forwarding which is just returning a single mbuf to
132 *        UMA will typically be very fast vis a vis the cost of a memory
133 *        access.
134 */
135
136/*
137 * File organization:
138 *  - private structures
139 *  - iflib private utility functions
140 *  - ifnet functions
141 *  - vlan registry and other exported functions
142 *  - iflib public core functions
143 *
144 *
145 */
146MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
147
148#define	IFLIB_RXEOF_MORE (1U << 0)
149#define	IFLIB_RXEOF_EMPTY (2U << 0)
150
151struct iflib_txq;
152typedef struct iflib_txq *iflib_txq_t;
153struct iflib_rxq;
154typedef struct iflib_rxq *iflib_rxq_t;
155struct iflib_fl;
156typedef struct iflib_fl *iflib_fl_t;
157
158struct iflib_ctx;
159
160static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
161static void iflib_timer(void *arg);
162static void iflib_tqg_detach(if_ctx_t ctx);
163
164typedef struct iflib_filter_info {
165	driver_filter_t *ifi_filter;
166	void *ifi_filter_arg;
167	struct grouptask *ifi_task;
168	void *ifi_ctx;
169} *iflib_filter_info_t;
170
171struct iflib_ctx {
172	KOBJ_FIELDS;
173	/*
174	 * Pointer to hardware driver's softc
175	 */
176	void *ifc_softc;
177	device_t ifc_dev;
178	if_t ifc_ifp;
179
180#ifndef __HAIKU__
181	cpuset_t ifc_cpus;
182#endif
183	if_shared_ctx_t ifc_sctx;
184	struct if_softc_ctx ifc_softc_ctx;
185
186	struct sx ifc_ctx_sx;
187	struct mtx ifc_state_mtx;
188
189	iflib_txq_t ifc_txqs;
190	iflib_rxq_t ifc_rxqs;
191	uint32_t ifc_if_flags;
192	uint32_t ifc_flags;
193	uint32_t ifc_max_fl_buf_size;
194	uint32_t ifc_rx_mbuf_sz;
195
196	int ifc_link_state;
197	int ifc_watchdog_events;
198	struct cdev *ifc_led_dev;
199	struct resource *ifc_msix_mem;
200
201	struct if_irq ifc_legacy_irq;
202	struct grouptask ifc_admin_task;
203	struct grouptask ifc_vflr_task;
204	struct iflib_filter_info ifc_filter_info;
205	struct ifmedia	ifc_media;
206	struct ifmedia	*ifc_mediap;
207
208	struct sysctl_oid *ifc_sysctl_node;
209	uint16_t ifc_sysctl_ntxqs;
210	uint16_t ifc_sysctl_nrxqs;
211	uint16_t ifc_sysctl_qs_eq_override;
212	uint16_t ifc_sysctl_rx_budget;
213	uint16_t ifc_sysctl_tx_abdicate;
214	uint16_t ifc_sysctl_core_offset;
215#define	CORE_OFFSET_UNSPECIFIED	0xffff
216	uint8_t  ifc_sysctl_separate_txrx;
217	uint8_t  ifc_sysctl_use_logical_cores;
218	bool	 ifc_cpus_are_physical_cores;
219
220	qidx_t ifc_sysctl_ntxds[8];
221	qidx_t ifc_sysctl_nrxds[8];
222	struct if_txrx ifc_txrx;
223#define isc_txd_encap  ifc_txrx.ift_txd_encap
224#define isc_txd_flush  ifc_txrx.ift_txd_flush
225#define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
226#define isc_rxd_available ifc_txrx.ift_rxd_available
227#define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
228#define isc_rxd_refill ifc_txrx.ift_rxd_refill
229#define isc_rxd_flush ifc_txrx.ift_rxd_flush
230#define isc_legacy_intr ifc_txrx.ift_legacy_intr
231	eventhandler_tag ifc_vlan_attach_event;
232	eventhandler_tag ifc_vlan_detach_event;
233	struct ether_addr ifc_mac;
234};
235
236void *
237iflib_get_softc(if_ctx_t ctx)
238{
239
240	return (ctx->ifc_softc);
241}
242
243device_t
244iflib_get_dev(if_ctx_t ctx)
245{
246
247	return (ctx->ifc_dev);
248}
249
250if_t
251iflib_get_ifp(if_ctx_t ctx)
252{
253
254	return (ctx->ifc_ifp);
255}
256
257struct ifmedia *
258iflib_get_media(if_ctx_t ctx)
259{
260
261	return (ctx->ifc_mediap);
262}
263
264uint32_t
265iflib_get_flags(if_ctx_t ctx)
266{
267	return (ctx->ifc_flags);
268}
269
270void
271iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
272{
273
274	bcopy(mac, ctx->ifc_mac.octet, ETHER_ADDR_LEN);
275}
276
277if_softc_ctx_t
278iflib_get_softc_ctx(if_ctx_t ctx)
279{
280
281	return (&ctx->ifc_softc_ctx);
282}
283
284if_shared_ctx_t
285iflib_get_sctx(if_ctx_t ctx)
286{
287
288	return (ctx->ifc_sctx);
289}
290
291#define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
292#define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
293#define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & (CACHE_LINE_SIZE-1)))
294
295#define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
296#define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
297
298typedef struct iflib_sw_rx_desc_array {
299	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
300	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
301	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
302	bus_addr_t	*ifsd_ba;          /* bus addr of cluster for rx */
303} iflib_rxsd_array_t;
304
305typedef struct iflib_sw_tx_desc_array {
306	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
307	bus_dmamap_t	*ifsd_tso_map;     /* bus_dma maps for TSO packet */
308	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
309} if_txsd_vec_t;
310
311/* magic number that should be high enough for any hardware */
312#define IFLIB_MAX_TX_SEGS		128
313#define IFLIB_RX_COPY_THRESH		128
314#define IFLIB_MAX_RX_REFRESH		32
315/* The minimum descriptors per second before we start coalescing */
316#define IFLIB_MIN_DESC_SEC		16384
317#define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
318#define IFLIB_QUEUE_IDLE		0
319#define IFLIB_QUEUE_HUNG		1
320#define IFLIB_QUEUE_WORKING		2
321/* maximum number of txqs that can share an rx interrupt */
322#define IFLIB_MAX_TX_SHARED_INTR	4
323
324/* this should really scale with ring size - this is a fairly arbitrary value */
325#define TX_BATCH_SIZE			32
326
327#define IFLIB_RESTART_BUDGET		8
328
329#define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
330				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
331				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
332
333struct iflib_txq {
334	qidx_t		ift_in_use;
335	qidx_t		ift_cidx;
336	qidx_t		ift_cidx_processed;
337	qidx_t		ift_pidx;
338	uint8_t		ift_gen;
339	uint8_t		ift_br_offset;
340	uint16_t	ift_npending;
341	uint16_t	ift_db_pending;
342	uint16_t	ift_rs_pending;
343	/* implicit pad */
344	uint8_t		ift_txd_size[8];
345	uint64_t	ift_processed;
346	uint64_t	ift_cleaned;
347	uint64_t	ift_cleaned_prev;
348#if MEMORY_LOGGING
349	uint64_t	ift_enqueued;
350	uint64_t	ift_dequeued;
351#endif
352	uint64_t	ift_no_tx_dma_setup;
353	uint64_t	ift_no_desc_avail;
354	uint64_t	ift_mbuf_defrag_failed;
355	uint64_t	ift_mbuf_defrag;
356	uint64_t	ift_map_failed;
357	uint64_t	ift_txd_encap_efbig;
358	uint64_t	ift_pullups;
359	uint64_t	ift_last_timer_tick;
360
361	struct mtx	ift_mtx;
362	struct mtx	ift_db_mtx;
363
364	/* constant values */
365	if_ctx_t	ift_ctx;
366	struct ifmp_ring        *ift_br;
367	struct grouptask	ift_task;
368	qidx_t		ift_size;
369	uint16_t	ift_id;
370	struct callout	ift_timer;
371#ifdef DEV_NETMAP
372	struct callout	ift_netmap_timer;
373#endif /* DEV_NETMAP */
374
375	if_txsd_vec_t	ift_sds;
376	uint8_t		ift_qstatus;
377	uint8_t		ift_closed;
378	uint8_t		ift_update_freq;
379	struct iflib_filter_info ift_filter_info;
380	bus_dma_tag_t	ift_buf_tag;
381	bus_dma_tag_t	ift_tso_buf_tag;
382	iflib_dma_info_t	ift_ifdi;
383#define	MTX_NAME_LEN	32
384	char                    ift_mtx_name[MTX_NAME_LEN];
385	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
386#ifdef IFLIB_DIAGNOSTICS
387	uint64_t ift_cpu_exec_count[256];
388#endif
389} __aligned(CACHE_LINE_SIZE);
390
391struct iflib_fl {
392	qidx_t		ifl_cidx;
393	qidx_t		ifl_pidx;
394	qidx_t		ifl_credits;
395	uint8_t		ifl_gen;
396	uint8_t		ifl_rxd_size;
397#if MEMORY_LOGGING
398	uint64_t	ifl_m_enqueued;
399	uint64_t	ifl_m_dequeued;
400	uint64_t	ifl_cl_enqueued;
401	uint64_t	ifl_cl_dequeued;
402#endif
403	/* implicit pad */
404	bitstr_t 	*ifl_rx_bitmap;
405	qidx_t		ifl_fragidx;
406	/* constant */
407	qidx_t		ifl_size;
408	uint16_t	ifl_buf_size;
409	uint16_t	ifl_cltype;
410#ifndef __HAIKU__
411	uma_zone_t	ifl_zone;
412#endif
413	iflib_rxsd_array_t	ifl_sds;
414	iflib_rxq_t	ifl_rxq;
415	uint8_t		ifl_id;
416	bus_dma_tag_t	ifl_buf_tag;
417	iflib_dma_info_t	ifl_ifdi;
418	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
419	qidx_t		ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
420}  __aligned(CACHE_LINE_SIZE);
421
422static inline qidx_t
423get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
424{
425	qidx_t used;
426
427	if (pidx > cidx)
428		used = pidx - cidx;
429	else if (pidx < cidx)
430		used = size - cidx + pidx;
431	else if (gen == 0 && pidx == cidx)
432		used = 0;
433	else if (gen == 1 && pidx == cidx)
434		used = size;
435	else
436		panic("bad state");
437
438	return (used);
439}
440
441#define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
442
443#define IDXDIFF(head, tail, wrap) \
444	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
445
446struct iflib_rxq {
447	if_ctx_t	ifr_ctx;
448	iflib_fl_t	ifr_fl;
449	uint64_t	ifr_rx_irq;
450#ifndef __HAIKU__
451	struct pfil_head	*pfil;
452#else
453#define PFIL_PASS 0
454#endif
455	/*
456	 * If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
457	 * the completion queue consumer index.  Otherwise it's unused.
458	 */
459	qidx_t		ifr_cq_cidx;
460	uint16_t	ifr_id;
461	uint8_t		ifr_nfl;
462	uint8_t		ifr_ntxqirq;
463	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
464	uint8_t		ifr_fl_offset;
465#ifndef __HAIKU__
466	struct lro_ctrl			ifr_lc;
467#endif
468	struct grouptask        ifr_task;
469	struct callout		ifr_watchdog;
470	struct iflib_filter_info ifr_filter_info;
471	iflib_dma_info_t		ifr_ifdi;
472
473	/* dynamically allocate if any drivers need a value substantially larger than this */
474	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
475#ifdef IFLIB_DIAGNOSTICS
476	uint64_t ifr_cpu_exec_count[256];
477#endif
478}  __aligned(CACHE_LINE_SIZE);
479
480typedef struct if_rxsd {
481	caddr_t *ifsd_cl;
482	iflib_fl_t ifsd_fl;
483} *if_rxsd_t;
484
485/* multiple of word size */
486#ifdef __LP64__
487#define PKT_INFO_SIZE	6
488#define RXD_INFO_SIZE	5
489#define PKT_TYPE uint64_t
490#else
491#define PKT_INFO_SIZE	11
492#define RXD_INFO_SIZE	8
493#define PKT_TYPE uint32_t
494#endif
495#define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
496#define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
497
498typedef struct if_pkt_info_pad {
499	PKT_TYPE pkt_val[PKT_INFO_SIZE];
500} *if_pkt_info_pad_t;
501typedef struct if_rxd_info_pad {
502	PKT_TYPE rxd_val[RXD_INFO_SIZE];
503} *if_rxd_info_pad_t;
504
505CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
506CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
507
508static inline void
509pkt_info_zero(if_pkt_info_t pi)
510{
511	if_pkt_info_pad_t pi_pad;
512
513	pi_pad = (if_pkt_info_pad_t)pi;
514	pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
515	pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
516#ifndef __LP64__
517	pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
518	pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
519#endif
520}
521
522#ifndef __HAIKU__
523static device_method_t iflib_pseudo_methods[] = {
524	DEVMETHOD(device_attach, noop_attach),
525	DEVMETHOD(device_detach, iflib_pseudo_detach),
526	DEVMETHOD_END
527};
528
529driver_t iflib_pseudodriver = {
530	"iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx),
531};
532#endif
533
534static inline void
535rxd_info_zero(if_rxd_info_t ri)
536{
537	if_rxd_info_pad_t ri_pad;
538	int i;
539
540	ri_pad = (if_rxd_info_pad_t)ri;
541	for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
542		ri_pad->rxd_val[i] = 0;
543		ri_pad->rxd_val[i+1] = 0;
544		ri_pad->rxd_val[i+2] = 0;
545		ri_pad->rxd_val[i+3] = 0;
546	}
547#ifdef __LP64__
548	ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
549#endif
550}
551
552/*
553 * Only allow a single packet to take up most 1/nth of the tx ring
554 */
555#define MAX_SINGLE_PACKET_FRACTION 12
556#define IF_BAD_DMA (bus_addr_t)-1
557
558#define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
559
560#define CTX_LOCK_INIT(_sc)  sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
561#define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
562#define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
563#define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
564
565#define STATE_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
566#define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
567#define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
568#define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
569
570#define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
571#define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
572
573void
574iflib_set_detach(if_ctx_t ctx)
575{
576	STATE_LOCK(ctx);
577	ctx->ifc_flags |= IFC_IN_DETACH;
578	STATE_UNLOCK(ctx);
579}
580
581/* Our boot-time initialization hook */
582static int	iflib_module_event_handler(module_t, int, void *);
583
584#ifndef __HAIKU__
585static moduledata_t iflib_moduledata = {
586	"iflib",
587	iflib_module_event_handler,
588	NULL
589};
590#endif
591
592DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
593MODULE_VERSION(iflib, 1);
594
595MODULE_DEPEND(iflib, pci, 1, 1, 1);
596MODULE_DEPEND(iflib, ether, 1, 1, 1);
597
598TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
599TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
600
601#ifndef IFLIB_DEBUG_COUNTERS
602#ifdef INVARIANTS
603#define IFLIB_DEBUG_COUNTERS 1
604#else
605#define IFLIB_DEBUG_COUNTERS 0
606#endif /* !INVARIANTS */
607#endif
608
609static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
610    "iflib driver parameters");
611
612/*
613 * XXX need to ensure that this can't accidentally cause the head to be moved backwards
614 */
615static int iflib_min_tx_latency = 0;
616SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
617		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
618static int iflib_no_tx_batch = 0;
619SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
620		   &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput");
621static int iflib_timer_default = 1000;
622SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW,
623		   &iflib_timer_default, 0, "number of ticks between iflib_timer calls");
624
625
626#if IFLIB_DEBUG_COUNTERS
627
628static int iflib_tx_seen;
629static int iflib_tx_sent;
630static int iflib_tx_encap;
631static int iflib_rx_allocs;
632static int iflib_fl_refills;
633static int iflib_fl_refills_large;
634static int iflib_tx_frees;
635
636SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
637		   &iflib_tx_seen, 0, "# TX mbufs seen");
638SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
639		   &iflib_tx_sent, 0, "# TX mbufs sent");
640SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
641		   &iflib_tx_encap, 0, "# TX mbufs encapped");
642SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
643		   &iflib_tx_frees, 0, "# TX frees");
644SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
645		   &iflib_rx_allocs, 0, "# RX allocations");
646SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
647		   &iflib_fl_refills, 0, "# refills");
648SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
649		   &iflib_fl_refills_large, 0, "# large refills");
650
651static int iflib_txq_drain_flushing;
652static int iflib_txq_drain_oactive;
653static int iflib_txq_drain_notready;
654
655SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
656		   &iflib_txq_drain_flushing, 0, "# drain flushes");
657SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
658		   &iflib_txq_drain_oactive, 0, "# drain oactives");
659SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
660		   &iflib_txq_drain_notready, 0, "# drain notready");
661
662static int iflib_encap_load_mbuf_fail;
663static int iflib_encap_pad_mbuf_fail;
664static int iflib_encap_txq_avail_fail;
665static int iflib_encap_txd_encap_fail;
666
667SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
668		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
669SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
670		   &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
671SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
672		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
673SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
674		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
675
676static int iflib_task_fn_rxs;
677static int iflib_rx_intr_enables;
678static int iflib_fast_intrs;
679static int iflib_rx_unavail;
680static int iflib_rx_ctx_inactive;
681static int iflib_rx_if_input;
682static int iflib_rxd_flush;
683
684static int iflib_verbose_debug;
685
686SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
687		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
688SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
689		   &iflib_rx_intr_enables, 0, "# RX intr enables");
690SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
691		   &iflib_fast_intrs, 0, "# fast_intr calls");
692SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
693		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
694SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
695		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
696SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
697		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
698SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
699	         &iflib_rxd_flush, 0, "# times rxd_flush called");
700SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
701		   &iflib_verbose_debug, 0, "enable verbose debugging");
702
703#define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
704static void
705iflib_debug_reset(void)
706{
707	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
708		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
709		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
710		iflib_txq_drain_notready =
711		iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
712		iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
713		iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
714		iflib_rx_unavail =
715		iflib_rx_ctx_inactive = iflib_rx_if_input =
716		iflib_rxd_flush = 0;
717}
718
719#else
720#define DBG_COUNTER_INC(name)
721static void iflib_debug_reset(void) {}
722#endif
723
724#define IFLIB_DEBUG 0
725
726static void iflib_tx_structures_free(if_ctx_t ctx);
727static void iflib_rx_structures_free(if_ctx_t ctx);
728static int iflib_queues_alloc(if_ctx_t ctx);
729static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
730static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
731static int iflib_qset_structures_setup(if_ctx_t ctx);
732static int iflib_msix_init(if_ctx_t ctx);
733static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
734static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
735static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
736#ifdef ALTQ
737static void iflib_altq_if_start(if_t ifp);
738static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
739#endif
740static int iflib_register(if_ctx_t);
741static void iflib_deregister(if_ctx_t);
742static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
743static uint16_t iflib_get_mbuf_size_for(unsigned int size);
744static void iflib_init_locked(if_ctx_t ctx);
745static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
746static void iflib_add_device_sysctl_post(if_ctx_t ctx);
747static void iflib_ifmp_purge(iflib_txq_t txq);
748static void _iflib_pre_assert(if_softc_ctx_t scctx);
749static void iflib_if_init_locked(if_ctx_t ctx);
750static void iflib_free_intr_mem(if_ctx_t ctx);
751#ifndef __NO_STRICT_ALIGNMENT
752static struct mbuf * iflib_fixup_rx(struct mbuf *m);
753#endif
754
755#ifndef __HAIKU__
756static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
757    SLIST_HEAD_INITIALIZER(cpu_offsets);
758struct cpu_offset {
759	SLIST_ENTRY(cpu_offset) entries;
760	cpuset_t	set;
761	unsigned int	refcount;
762	uint16_t	next_cpuid;
763};
764static struct mtx cpu_offset_mtx;
765MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
766    MTX_DEF);
767#endif
768
769DEBUGNET_DEFINE(iflib);
770
771static int
772iflib_num_rx_descs(if_ctx_t ctx)
773{
774	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
775	if_shared_ctx_t sctx = ctx->ifc_sctx;
776	uint16_t first_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
777
778	return scctx->isc_nrxd[first_rxq];
779}
780
781static int
782iflib_num_tx_descs(if_ctx_t ctx)
783{
784	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
785	if_shared_ctx_t sctx = ctx->ifc_sctx;
786	uint16_t first_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
787
788	return scctx->isc_ntxd[first_txq];
789}
790
791#ifdef DEV_NETMAP
792#include <sys/selinfo.h>
793#include <net/netmap.h>
794#include <dev/netmap/netmap_kern.h>
795
796MODULE_DEPEND(iflib, netmap, 1, 1, 1);
797
798static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init);
799static void iflib_netmap_timer(void *arg);
800
801/*
802 * device-specific sysctl variables:
803 *
804 * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
805 *	During regular operations the CRC is stripped, but on some
806 *	hardware reception of frames not multiple of 64 is slower,
807 *	so using crcstrip=0 helps in benchmarks.
808 *
809 * iflib_rx_miss, iflib_rx_miss_bufs:
810 *	count packets that might be missed due to lost interrupts.
811 */
812SYSCTL_DECL(_dev_netmap);
813/*
814 * The xl driver by default strips CRCs and we do not override it.
815 */
816
817int iflib_crcstrip = 1;
818SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
819    CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
820
821int iflib_rx_miss, iflib_rx_miss_bufs;
822SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
823    CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
824SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
825    CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
826
827/*
828 * Register/unregister. We are already under netmap lock.
829 * Only called on the first register or the last unregister.
830 */
831static int
832iflib_netmap_register(struct netmap_adapter *na, int onoff)
833{
834	if_t ifp = na->ifp;
835	if_ctx_t ctx = ifp->if_softc;
836	int status;
837
838	CTX_LOCK(ctx);
839	if (!CTX_IS_VF(ctx))
840		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
841
842	iflib_stop(ctx);
843
844	/*
845	 * Enable (or disable) netmap flags, and intercept (or restore)
846	 * ifp->if_transmit. This is done once the device has been stopped
847	 * to prevent race conditions. Also, this must be done after
848	 * calling netmap_disable_all_rings() and before calling
849	 * netmap_enable_all_rings(), so that these two functions see the
850	 * updated state of the NAF_NETMAP_ON bit.
851	 */
852	if (onoff) {
853		nm_set_native_flags(na);
854	} else {
855		nm_clear_native_flags(na);
856	}
857
858	iflib_init_locked(ctx);
859	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
860	status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1;
861	if (status)
862		nm_clear_native_flags(na);
863	CTX_UNLOCK(ctx);
864	return (status);
865}
866
867static int
868iflib_netmap_config(struct netmap_adapter *na, struct nm_config_info *info)
869{
870	if_t ifp = na->ifp;
871	if_ctx_t ctx = ifp->if_softc;
872	iflib_rxq_t rxq = &ctx->ifc_rxqs[0];
873	iflib_fl_t fl = &rxq->ifr_fl[0];
874
875	info->num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
876	info->num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
877	info->num_tx_descs = iflib_num_tx_descs(ctx);
878	info->num_rx_descs = iflib_num_rx_descs(ctx);
879	info->rx_buf_maxsize = fl->ifl_buf_size;
880	nm_prinf("txr %u rxr %u txd %u rxd %u rbufsz %u",
881		info->num_tx_rings, info->num_rx_rings, info->num_tx_descs,
882		info->num_rx_descs, info->rx_buf_maxsize);
883
884	return 0;
885}
886
887static int
888netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
889{
890	struct netmap_adapter *na = kring->na;
891	u_int const lim = kring->nkr_num_slots - 1;
892	struct netmap_ring *ring = kring->ring;
893	bus_dmamap_t *map;
894	struct if_rxd_update iru;
895	if_ctx_t ctx = rxq->ifr_ctx;
896	iflib_fl_t fl = &rxq->ifr_fl[0];
897	u_int nic_i_first, nic_i;
898	u_int nm_i;
899	int i, n;
900#if IFLIB_DEBUG_COUNTERS
901	int rf_count = 0;
902#endif
903
904	/*
905	 * This function is used both at initialization and in rxsync.
906	 * At initialization we need to prepare (with isc_rxd_refill())
907	 * all the netmap buffers currently owned by the kernel, in
908	 * such a way to keep fl->ifl_pidx and kring->nr_hwcur in sync
909	 * (except for kring->nkr_hwofs). These may be less than
910	 * kring->nkr_num_slots if netmap_reset() was called while
911	 * an application using the kring that still owned some
912	 * buffers.
913	 * At rxsync time, both indexes point to the next buffer to be
914	 * refilled.
915	 * In any case we publish (with isc_rxd_flush()) up to
916	 * (fl->ifl_pidx - 1) % N (included), to avoid the NIC tail/prod
917	 * pointer to overrun the head/cons pointer, although this is
918	 * not necessary for some NICs (e.g. vmx).
919	 */
920	if (__predict_false(init)) {
921		n = kring->nkr_num_slots - nm_kr_rxspace(kring);
922	} else {
923		n = kring->rhead - kring->nr_hwcur;
924		if (n == 0)
925			return (0); /* Nothing to do. */
926		if (n < 0)
927			n += kring->nkr_num_slots;
928	}
929
930	iru_init(&iru, rxq, 0 /* flid */);
931	map = fl->ifl_sds.ifsd_map;
932	nic_i = fl->ifl_pidx;
933	nm_i = netmap_idx_n2k(kring, nic_i);
934	if (__predict_false(init)) {
935		/*
936		 * On init/reset, nic_i must be 0, and we must
937		 * start to refill from hwtail (see netmap_reset()).
938		 */
939		MPASS(nic_i == 0);
940		MPASS(nm_i == kring->nr_hwtail);
941	} else
942		MPASS(nm_i == kring->nr_hwcur);
943	DBG_COUNTER_INC(fl_refills);
944	while (n > 0) {
945#if IFLIB_DEBUG_COUNTERS
946		if (++rf_count == 9)
947			DBG_COUNTER_INC(fl_refills_large);
948#endif
949		nic_i_first = nic_i;
950		for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
951			struct netmap_slot *slot = &ring->slot[nm_i];
952			void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[i]);
953
954			MPASS(i < IFLIB_MAX_RX_REFRESH);
955
956			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
957			        return netmap_ring_reinit(kring);
958
959			fl->ifl_rxd_idxs[i] = nic_i;
960
961			if (__predict_false(init)) {
962				netmap_load_map(na, fl->ifl_buf_tag,
963				    map[nic_i], addr);
964			} else if (slot->flags & NS_BUF_CHANGED) {
965				/* buffer has changed, reload map */
966				netmap_reload_map(na, fl->ifl_buf_tag,
967				    map[nic_i], addr);
968			}
969			bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i],
970			    BUS_DMASYNC_PREREAD);
971			slot->flags &= ~NS_BUF_CHANGED;
972
973			nm_i = nm_next(nm_i, lim);
974			nic_i = nm_next(nic_i, lim);
975		}
976
977		iru.iru_pidx = nic_i_first;
978		iru.iru_count = i;
979		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
980	}
981	fl->ifl_pidx = nic_i;
982	/*
983	 * At the end of the loop we must have refilled everything
984	 * we could possibly refill.
985	 */
986	MPASS(nm_i == kring->rhead);
987	kring->nr_hwcur = nm_i;
988
989	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
990	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
991	ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
992	    nm_prev(nic_i, lim));
993	DBG_COUNTER_INC(rxd_flush);
994
995	return (0);
996}
997
998#define NETMAP_TX_TIMER_US	90
999
1000/*
1001 * Reconcile kernel and user view of the transmit ring.
1002 *
1003 * All information is in the kring.
1004 * Userspace wants to send packets up to the one before kring->rhead,
1005 * kernel knows kring->nr_hwcur is the first unsent packet.
1006 *
1007 * Here we push packets out (as many as possible), and possibly
1008 * reclaim buffers from previously completed transmission.
1009 *
1010 * The caller (netmap) guarantees that there is only one instance
1011 * running at any time. Any interference with other driver
1012 * methods should be handled by the individual drivers.
1013 */
1014static int
1015iflib_netmap_txsync(struct netmap_kring *kring, int flags)
1016{
1017	struct netmap_adapter *na = kring->na;
1018	if_t ifp = na->ifp;
1019	struct netmap_ring *ring = kring->ring;
1020	u_int nm_i;	/* index into the netmap kring */
1021	u_int nic_i;	/* index into the NIC ring */
1022	u_int n;
1023	u_int const lim = kring->nkr_num_slots - 1;
1024	u_int const head = kring->rhead;
1025	struct if_pkt_info pi;
1026	int tx_pkts = 0, tx_bytes = 0;
1027
1028	/*
1029	 * interrupts on every tx packet are expensive so request
1030	 * them every half ring, or where NS_REPORT is set
1031	 */
1032	u_int report_frequency = kring->nkr_num_slots >> 1;
1033	/* device-specific */
1034	if_ctx_t ctx = ifp->if_softc;
1035	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
1036
1037	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1038	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1039
1040	/*
1041	 * First part: process new packets to send.
1042	 * nm_i is the current index in the netmap kring,
1043	 * nic_i is the corresponding index in the NIC ring.
1044	 *
1045	 * If we have packets to send (nm_i != head)
1046	 * iterate over the netmap ring, fetch length and update
1047	 * the corresponding slot in the NIC ring. Some drivers also
1048	 * need to update the buffer's physical address in the NIC slot
1049	 * even NS_BUF_CHANGED is not set (PNMB computes the addresses).
1050	 *
1051	 * The netmap_reload_map() calls is especially expensive,
1052	 * even when (as in this case) the tag is 0, so do only
1053	 * when the buffer has actually changed.
1054	 *
1055	 * If possible do not set the report/intr bit on all slots,
1056	 * but only a few times per ring or when NS_REPORT is set.
1057	 *
1058	 * Finally, on 10G and faster drivers, it might be useful
1059	 * to prefetch the next slot and txr entry.
1060	 */
1061
1062	nm_i = kring->nr_hwcur;
1063	if (nm_i != head) {	/* we have new packets to send */
1064		uint32_t pkt_len = 0, seg_idx = 0;
1065		int nic_i_start = -1, flags = 0;
1066		pkt_info_zero(&pi);
1067		pi.ipi_segs = txq->ift_segs;
1068		pi.ipi_qsidx = kring->ring_id;
1069		nic_i = netmap_idx_k2n(kring, nm_i);
1070
1071		__builtin_prefetch(&ring->slot[nm_i]);
1072		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
1073		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
1074
1075		for (n = 0; nm_i != head; n++) {
1076			struct netmap_slot *slot = &ring->slot[nm_i];
1077			u_int len = slot->len;
1078			uint64_t paddr;
1079			void *addr = PNMB(na, slot, &paddr);
1080
1081			flags |= (slot->flags & NS_REPORT ||
1082				nic_i == 0 || nic_i == report_frequency) ?
1083				IPI_TX_INTR : 0;
1084
1085			/*
1086			 * If this is the first packet fragment, save the
1087			 * index of the first NIC slot for later.
1088			 */
1089			if (nic_i_start < 0)
1090				nic_i_start = nic_i;
1091
1092			pi.ipi_segs[seg_idx].ds_addr = paddr;
1093			pi.ipi_segs[seg_idx].ds_len = len;
1094			if (len) {
1095				pkt_len += len;
1096				seg_idx++;
1097			}
1098
1099			if (!(slot->flags & NS_MOREFRAG)) {
1100				pi.ipi_len = pkt_len;
1101				pi.ipi_nsegs = seg_idx;
1102				pi.ipi_pidx = nic_i_start;
1103				pi.ipi_ndescs = 0;
1104				pi.ipi_flags = flags;
1105
1106				/* Prepare the NIC TX ring. */
1107				ctx->isc_txd_encap(ctx->ifc_softc, &pi);
1108				DBG_COUNTER_INC(tx_encap);
1109
1110				/* Update transmit counters */
1111				tx_bytes += pi.ipi_len;
1112				tx_pkts++;
1113
1114				/* Reinit per-packet info for the next one. */
1115				flags = seg_idx = pkt_len = 0;
1116				nic_i_start = -1;
1117			}
1118
1119			/* prefetch for next round */
1120			__builtin_prefetch(&ring->slot[nm_i + 1]);
1121			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
1122			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
1123
1124			NM_CHECK_ADDR_LEN(na, addr, len);
1125
1126			if (slot->flags & NS_BUF_CHANGED) {
1127				/* buffer has changed, reload map */
1128				netmap_reload_map(na, txq->ift_buf_tag,
1129				    txq->ift_sds.ifsd_map[nic_i], addr);
1130			}
1131			/* make sure changes to the buffer are synced */
1132			bus_dmamap_sync(txq->ift_buf_tag,
1133			    txq->ift_sds.ifsd_map[nic_i],
1134			    BUS_DMASYNC_PREWRITE);
1135
1136			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED | NS_MOREFRAG);
1137			nm_i = nm_next(nm_i, lim);
1138			nic_i = nm_next(nic_i, lim);
1139		}
1140		kring->nr_hwcur = nm_i;
1141
1142		/* synchronize the NIC ring */
1143		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1144		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1145
1146		/* (re)start the tx unit up to slot nic_i (excluded) */
1147		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
1148	}
1149
1150	/*
1151	 * Second part: reclaim buffers for completed transmissions.
1152	 *
1153	 * If there are unclaimed buffers, attempt to reclaim them.
1154	 * If we don't manage to reclaim them all, and TX IRQs are not in use,
1155	 * trigger a per-tx-queue timer to try again later.
1156	 */
1157	if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
1158		if (iflib_tx_credits_update(ctx, txq)) {
1159			/* some tx completed, increment avail */
1160			nic_i = txq->ift_cidx_processed;
1161			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
1162		}
1163	}
1164
1165	if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
1166		if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
1167			callout_reset_sbt_on(&txq->ift_netmap_timer,
1168			    NETMAP_TX_TIMER_US * SBT_1US, SBT_1US,
1169			    iflib_netmap_timer, txq,
1170			    txq->ift_netmap_timer.c_cpu, 0);
1171		}
1172
1173	if_inc_counter(ifp, IFCOUNTER_OBYTES, tx_bytes);
1174	if_inc_counter(ifp, IFCOUNTER_OPACKETS, tx_pkts);
1175
1176	return (0);
1177}
1178
1179/*
1180 * Reconcile kernel and user view of the receive ring.
1181 * Same as for the txsync, this routine must be efficient.
1182 * The caller guarantees a single invocations, but races against
1183 * the rest of the driver should be handled here.
1184 *
1185 * On call, kring->rhead is the first packet that userspace wants
1186 * to keep, and kring->rcur is the wakeup point.
1187 * The kernel has previously reported packets up to kring->rtail.
1188 *
1189 * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
1190 * of whether or not we received an interrupt.
1191 */
1192static int
1193iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
1194{
1195	struct netmap_adapter *na = kring->na;
1196	struct netmap_ring *ring = kring->ring;
1197	if_t ifp = na->ifp;
1198	uint32_t nm_i;	/* index into the netmap ring */
1199	uint32_t nic_i;	/* index into the NIC ring */
1200	u_int n;
1201	u_int const lim = kring->nkr_num_slots - 1;
1202	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
1203	int i = 0, rx_bytes = 0, rx_pkts = 0;
1204
1205	if_ctx_t ctx = ifp->if_softc;
1206	if_shared_ctx_t sctx = ctx->ifc_sctx;
1207	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1208	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
1209	iflib_fl_t fl = &rxq->ifr_fl[0];
1210	struct if_rxd_info ri;
1211	qidx_t *cidxp;
1212
1213	/*
1214	 * netmap only uses free list 0, to avoid out of order consumption
1215	 * of receive buffers
1216	 */
1217
1218	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
1219	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1220
1221	/*
1222	 * First part: import newly received packets.
1223	 *
1224	 * nm_i is the index of the next free slot in the netmap ring,
1225	 * nic_i is the index of the next received packet in the NIC ring
1226	 * (or in the free list 0 if IFLIB_HAS_RXCQ is set), and they may
1227	 * differ in case if_init() has been called while
1228	 * in netmap mode. For the receive ring we have
1229	 *
1230	 *	nic_i = fl->ifl_cidx;
1231	 *	nm_i = kring->nr_hwtail (previous)
1232	 * and
1233	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1234	 *
1235	 * fl->ifl_cidx is set to 0 on a ring reinit
1236	 */
1237	if (netmap_no_pendintr || force_update) {
1238		uint32_t hwtail_lim = nm_prev(kring->nr_hwcur, lim);
1239		bool have_rxcq = sctx->isc_flags & IFLIB_HAS_RXCQ;
1240		int crclen = iflib_crcstrip ? 0 : 4;
1241		int error, avail;
1242
1243		/*
1244		 * For the free list consumer index, we use the same
1245		 * logic as in iflib_rxeof().
1246		 */
1247		if (have_rxcq)
1248			cidxp = &rxq->ifr_cq_cidx;
1249		else
1250			cidxp = &fl->ifl_cidx;
1251		avail = ctx->isc_rxd_available(ctx->ifc_softc,
1252		    rxq->ifr_id, *cidxp, USHRT_MAX);
1253
1254		nic_i = fl->ifl_cidx;
1255		nm_i = netmap_idx_n2k(kring, nic_i);
1256		MPASS(nm_i == kring->nr_hwtail);
1257		for (n = 0; avail > 0 && nm_i != hwtail_lim; n++, avail--) {
1258			rxd_info_zero(&ri);
1259			ri.iri_frags = rxq->ifr_frags;
1260			ri.iri_qsidx = kring->ring_id;
1261			ri.iri_ifp = ctx->ifc_ifp;
1262			ri.iri_cidx = *cidxp;
1263
1264			error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
1265			for (i = 0; i < ri.iri_nfrags; i++) {
1266				if (error) {
1267					ring->slot[nm_i].len = 0;
1268					ring->slot[nm_i].flags = 0;
1269				} else {
1270					ring->slot[nm_i].len = ri.iri_frags[i].irf_len;
1271					if (i == (ri.iri_nfrags - 1)) {
1272						ring->slot[nm_i].len -= crclen;
1273						ring->slot[nm_i].flags = 0;
1274
1275						/* Update receive counters */
1276						rx_bytes += ri.iri_len;
1277						rx_pkts++;
1278					} else
1279						ring->slot[nm_i].flags = NS_MOREFRAG;
1280				}
1281
1282				bus_dmamap_sync(fl->ifl_buf_tag,
1283				    fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
1284				nm_i = nm_next(nm_i, lim);
1285				fl->ifl_cidx = nic_i = nm_next(nic_i, lim);
1286			}
1287
1288			if (have_rxcq) {
1289				*cidxp = ri.iri_cidx;
1290				while (*cidxp >= scctx->isc_nrxd[0])
1291					*cidxp -= scctx->isc_nrxd[0];
1292			}
1293
1294		}
1295		if (n) { /* update the state variables */
1296			if (netmap_no_pendintr && !force_update) {
1297				/* diagnostics */
1298				iflib_rx_miss ++;
1299				iflib_rx_miss_bufs += n;
1300			}
1301			kring->nr_hwtail = nm_i;
1302		}
1303		kring->nr_kflags &= ~NKR_PENDINTR;
1304	}
1305	/*
1306	 * Second part: skip past packets that userspace has released.
1307	 * (kring->nr_hwcur to head excluded),
1308	 * and make the buffers available for reception.
1309	 * As usual nm_i is the index in the netmap ring,
1310	 * nic_i is the index in the NIC ring, and
1311	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1312	 */
1313	netmap_fl_refill(rxq, kring, false);
1314
1315	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
1316	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
1317
1318	return (0);
1319}
1320
1321static void
1322iflib_netmap_intr(struct netmap_adapter *na, int onoff)
1323{
1324	if_ctx_t ctx = na->ifp->if_softc;
1325
1326	CTX_LOCK(ctx);
1327	if (onoff) {
1328		IFDI_INTR_ENABLE(ctx);
1329	} else {
1330		IFDI_INTR_DISABLE(ctx);
1331	}
1332	CTX_UNLOCK(ctx);
1333}
1334
1335static int
1336iflib_netmap_attach(if_ctx_t ctx)
1337{
1338	struct netmap_adapter na;
1339
1340	bzero(&na, sizeof(na));
1341
1342	na.ifp = ctx->ifc_ifp;
1343	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MOREFRAG;
1344	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
1345	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
1346
1347	na.num_tx_desc = iflib_num_tx_descs(ctx);
1348	na.num_rx_desc = iflib_num_rx_descs(ctx);
1349	na.nm_txsync = iflib_netmap_txsync;
1350	na.nm_rxsync = iflib_netmap_rxsync;
1351	na.nm_register = iflib_netmap_register;
1352	na.nm_intr = iflib_netmap_intr;
1353	na.nm_config = iflib_netmap_config;
1354	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
1355	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
1356	return (netmap_attach(&na));
1357}
1358
1359static int
1360iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
1361{
1362	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1363	struct netmap_slot *slot;
1364
1365	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
1366	if (slot == NULL)
1367		return (0);
1368	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
1369		/*
1370		 * In netmap mode, set the map for the packet buffer.
1371		 * NOTE: Some drivers (not this one) also need to set
1372		 * the physical buffer address in the NIC ring.
1373		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
1374		 * netmap slot index, si
1375		 */
1376		int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i);
1377		netmap_load_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[i],
1378		    NMB(na, slot + si));
1379	}
1380	return (1);
1381}
1382
1383static int
1384iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
1385{
1386	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1387	struct netmap_kring *kring;
1388	struct netmap_slot *slot;
1389
1390	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
1391	if (slot == NULL)
1392		return (0);
1393	kring = na->rx_rings[rxq->ifr_id];
1394	netmap_fl_refill(rxq, kring, true);
1395	return (1);
1396}
1397
1398static void
1399iflib_netmap_timer(void *arg)
1400{
1401	iflib_txq_t txq = arg;
1402	if_ctx_t ctx = txq->ift_ctx;
1403
1404	/*
1405	 * Wake up the netmap application, to give it a chance to
1406	 * call txsync and reclaim more completed TX buffers.
1407	 */
1408	netmap_tx_irq(ctx->ifc_ifp, txq->ift_id);
1409}
1410
1411#define iflib_netmap_detach(ifp) netmap_detach(ifp)
1412
1413#else
1414#define iflib_netmap_txq_init(ctx, txq) (0)
1415#define iflib_netmap_rxq_init(ctx, rxq) (0)
1416#define iflib_netmap_detach(ifp)
1417#define netmap_enable_all_rings(ifp)
1418#define netmap_disable_all_rings(ifp)
1419
1420#define iflib_netmap_attach(ctx) (0)
1421#define netmap_rx_irq(ifp, qid, budget) (0)
1422#endif
1423
1424#if defined(__i386__) || defined(__amd64__)
1425static __inline void
1426prefetch(void *x)
1427{
1428	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1429}
1430static __inline void
1431prefetch2cachelines(void *x)
1432{
1433	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1434#if (CACHE_LINE_SIZE < 128)
1435	__asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long)))));
1436#endif
1437}
1438#else
1439#define prefetch(x)
1440#define prefetch2cachelines(x)
1441#endif
1442
1443static void
1444iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
1445{
1446	iflib_fl_t fl;
1447
1448	fl = &rxq->ifr_fl[flid];
1449	iru->iru_paddrs = fl->ifl_bus_addrs;
1450	iru->iru_idxs = fl->ifl_rxd_idxs;
1451	iru->iru_qsidx = rxq->ifr_id;
1452	iru->iru_buf_size = fl->ifl_buf_size;
1453	iru->iru_flidx = fl->ifl_id;
1454}
1455
1456static void
1457_iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1458{
1459	if (err)
1460		return;
1461	*(bus_addr_t *) arg = segs[0].ds_addr;
1462}
1463
1464int
1465iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
1466{
1467	int err;
1468	device_t dev = ctx->ifc_dev;
1469
1470	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
1471				align, 0,		/* alignment, bounds */
1472				BUS_SPACE_MAXADDR,	/* lowaddr */
1473				BUS_SPACE_MAXADDR,	/* highaddr */
1474				NULL, NULL,		/* filter, filterarg */
1475				size,			/* maxsize */
1476				1,			/* nsegments */
1477				size,			/* maxsegsize */
1478				BUS_DMA_ALLOCNOW,	/* flags */
1479				NULL,			/* lockfunc */
1480				NULL,			/* lockarg */
1481				&dma->idi_tag);
1482	if (err) {
1483		device_printf(dev,
1484		    "%s: bus_dma_tag_create failed: %d\n",
1485		    __func__, err);
1486		goto fail_0;
1487	}
1488
1489	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
1490	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
1491	if (err) {
1492		device_printf(dev,
1493		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
1494		    __func__, (uintmax_t)size, err);
1495		goto fail_1;
1496	}
1497
1498	dma->idi_paddr = IF_BAD_DMA;
1499	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
1500	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
1501	if (err || dma->idi_paddr == IF_BAD_DMA) {
1502		device_printf(dev,
1503		    "%s: bus_dmamap_load failed: %d\n",
1504		    __func__, err);
1505		goto fail_2;
1506	}
1507
1508	dma->idi_size = size;
1509	return (0);
1510
1511fail_2:
1512	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1513fail_1:
1514	bus_dma_tag_destroy(dma->idi_tag);
1515fail_0:
1516	dma->idi_tag = NULL;
1517
1518	return (err);
1519}
1520
1521int
1522iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
1523{
1524	if_shared_ctx_t sctx = ctx->ifc_sctx;
1525
1526	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
1527
1528	return (iflib_dma_alloc_align(ctx, size, sctx->isc_q_align, dma, mapflags));
1529}
1530
1531int
1532iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
1533{
1534	int i, err;
1535	iflib_dma_info_t *dmaiter;
1536
1537	dmaiter = dmalist;
1538	for (i = 0; i < count; i++, dmaiter++) {
1539		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
1540			break;
1541	}
1542	if (err)
1543		iflib_dma_free_multi(dmalist, i);
1544	return (err);
1545}
1546
1547void
1548iflib_dma_free(iflib_dma_info_t dma)
1549{
1550	if (dma->idi_tag == NULL)
1551		return;
1552	if (dma->idi_paddr != IF_BAD_DMA) {
1553		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
1554		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1555		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
1556		dma->idi_paddr = IF_BAD_DMA;
1557	}
1558	if (dma->idi_vaddr != NULL) {
1559		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1560		dma->idi_vaddr = NULL;
1561	}
1562	bus_dma_tag_destroy(dma->idi_tag);
1563	dma->idi_tag = NULL;
1564}
1565
1566void
1567iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
1568{
1569	int i;
1570	iflib_dma_info_t *dmaiter = dmalist;
1571
1572	for (i = 0; i < count; i++, dmaiter++)
1573		iflib_dma_free(*dmaiter);
1574}
1575
1576static int
1577iflib_fast_intr(void *arg)
1578{
1579	iflib_filter_info_t info = arg;
1580	struct grouptask *gtask = info->ifi_task;
1581	int result;
1582
1583	DBG_COUNTER_INC(fast_intrs);
1584	if (info->ifi_filter != NULL) {
1585		result = info->ifi_filter(info->ifi_filter_arg);
1586		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1587			return (result);
1588	}
1589
1590	GROUPTASK_ENQUEUE(gtask);
1591	return (FILTER_HANDLED);
1592}
1593
1594static int
1595iflib_fast_intr_rxtx(void *arg)
1596{
1597	iflib_filter_info_t info = arg;
1598	struct grouptask *gtask = info->ifi_task;
1599	if_ctx_t ctx;
1600	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
1601	iflib_txq_t txq;
1602	void *sc;
1603	int i, cidx, result;
1604	qidx_t txqid;
1605	bool intr_enable, intr_legacy;
1606
1607	DBG_COUNTER_INC(fast_intrs);
1608	if (info->ifi_filter != NULL) {
1609		result = info->ifi_filter(info->ifi_filter_arg);
1610		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1611			return (result);
1612	}
1613
1614	ctx = rxq->ifr_ctx;
1615	sc = ctx->ifc_softc;
1616	intr_enable = false;
1617	intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY);
1618	MPASS(rxq->ifr_ntxqirq);
1619	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
1620		txqid = rxq->ifr_txqid[i];
1621		txq = &ctx->ifc_txqs[txqid];
1622		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1623		    BUS_DMASYNC_POSTREAD);
1624		if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
1625			if (intr_legacy)
1626				intr_enable = true;
1627			else
1628				IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
1629			continue;
1630		}
1631		GROUPTASK_ENQUEUE(&txq->ift_task);
1632	}
1633	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
1634		cidx = rxq->ifr_cq_cidx;
1635	else
1636		cidx = rxq->ifr_fl[0].ifl_cidx;
1637	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
1638		GROUPTASK_ENQUEUE(gtask);
1639	else {
1640		if (intr_legacy)
1641			intr_enable = true;
1642		else
1643			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
1644		DBG_COUNTER_INC(rx_intr_enables);
1645	}
1646	if (intr_enable)
1647		IFDI_INTR_ENABLE(ctx);
1648	return (FILTER_HANDLED);
1649}
1650
1651static int
1652iflib_fast_intr_ctx(void *arg)
1653{
1654	iflib_filter_info_t info = arg;
1655	struct grouptask *gtask = info->ifi_task;
1656	int result;
1657
1658	DBG_COUNTER_INC(fast_intrs);
1659	if (info->ifi_filter != NULL) {
1660		result = info->ifi_filter(info->ifi_filter_arg);
1661		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1662			return (result);
1663	}
1664
1665	GROUPTASK_ENQUEUE(gtask);
1666	return (FILTER_HANDLED);
1667}
1668
1669static int
1670_iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
1671		 driver_filter_t filter, driver_intr_t handler, void *arg,
1672		 const char *name)
1673{
1674	struct resource *res;
1675	void *tag = NULL;
1676	device_t dev = ctx->ifc_dev;
1677	int flags, i, rc;
1678
1679	flags = RF_ACTIVE;
1680	if (ctx->ifc_flags & IFC_LEGACY)
1681		flags |= RF_SHAREABLE;
1682	MPASS(rid < 512);
1683	i = rid;
1684	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, flags);
1685	if (res == NULL) {
1686		device_printf(dev,
1687		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
1688		return (ENOMEM);
1689	}
1690	irq->ii_res = res;
1691	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
1692	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
1693						filter, handler, arg, &tag);
1694	if (rc != 0) {
1695		device_printf(dev,
1696		    "failed to setup interrupt for rid %d, name %s: %d\n",
1697					  rid, name ? name : "unknown", rc);
1698		return (rc);
1699	} else if (name)
1700		bus_describe_intr(dev, res, tag, "%s", name);
1701
1702	irq->ii_tag = tag;
1703	return (0);
1704}
1705
1706/*********************************************************************
1707 *
1708 *  Allocate DMA resources for TX buffers as well as memory for the TX
1709 *  mbuf map.  TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in a
1710 *  iflib_sw_tx_desc_array structure, storing all the information that
1711 *  is needed to transmit a packet on the wire.  This is called only
1712 *  once at attach, setup is done every reset.
1713 *
1714 **********************************************************************/
1715static int
1716iflib_txsd_alloc(iflib_txq_t txq)
1717{
1718	if_ctx_t ctx = txq->ift_ctx;
1719	if_shared_ctx_t sctx = ctx->ifc_sctx;
1720	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1721	device_t dev = ctx->ifc_dev;
1722	bus_size_t tsomaxsize;
1723	int err, nsegments, ntsosegments;
1724	bool tso;
1725
1726	nsegments = scctx->isc_tx_nsegments;
1727	ntsosegments = scctx->isc_tx_tso_segments_max;
1728	tsomaxsize = scctx->isc_tx_tso_size_max;
1729	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
1730		tsomaxsize += sizeof(struct ether_vlan_header);
1731	MPASS(scctx->isc_ntxd[0] > 0);
1732	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
1733	MPASS(nsegments > 0);
1734	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
1735		MPASS(ntsosegments > 0);
1736		MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
1737	}
1738
1739	/*
1740	 * Set up DMA tags for TX buffers.
1741	 */
1742	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
1743			       1, 0,			/* alignment, bounds */
1744			       BUS_SPACE_MAXADDR,	/* lowaddr */
1745			       BUS_SPACE_MAXADDR,	/* highaddr */
1746			       NULL, NULL,		/* filter, filterarg */
1747			       sctx->isc_tx_maxsize,		/* maxsize */
1748			       nsegments,	/* nsegments */
1749			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
1750			       0,			/* flags */
1751			       NULL,			/* lockfunc */
1752			       NULL,			/* lockfuncarg */
1753			       &txq->ift_buf_tag))) {
1754		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
1755		device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n",
1756		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
1757		goto fail;
1758	}
1759	tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
1760	if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev),
1761			       1, 0,			/* alignment, bounds */
1762			       BUS_SPACE_MAXADDR,	/* lowaddr */
1763			       BUS_SPACE_MAXADDR,	/* highaddr */
1764			       NULL, NULL,		/* filter, filterarg */
1765			       tsomaxsize,		/* maxsize */
1766			       ntsosegments,	/* nsegments */
1767			       sctx->isc_tso_maxsegsize,/* maxsegsize */
1768			       0,			/* flags */
1769			       NULL,			/* lockfunc */
1770			       NULL,			/* lockfuncarg */
1771			       &txq->ift_tso_buf_tag))) {
1772		device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n",
1773		    err);
1774		goto fail;
1775	}
1776
1777	/* Allocate memory for the TX mbuf map. */
1778	if (!(txq->ift_sds.ifsd_m =
1779	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
1780	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1781		device_printf(dev, "Unable to allocate TX mbuf map memory\n");
1782		err = ENOMEM;
1783		goto fail;
1784	}
1785
1786	/*
1787	 * Create the DMA maps for TX buffers.
1788	 */
1789	if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc(
1790	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
1791	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
1792		device_printf(dev,
1793		    "Unable to allocate TX buffer DMA map memory\n");
1794		err = ENOMEM;
1795		goto fail;
1796	}
1797	if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc(
1798	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
1799	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
1800		device_printf(dev,
1801		    "Unable to allocate TSO TX buffer map memory\n");
1802		err = ENOMEM;
1803		goto fail;
1804	}
1805	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
1806		err = bus_dmamap_create(txq->ift_buf_tag, 0,
1807		    &txq->ift_sds.ifsd_map[i]);
1808		if (err != 0) {
1809			device_printf(dev, "Unable to create TX DMA map\n");
1810			goto fail;
1811		}
1812		if (!tso)
1813			continue;
1814		err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
1815		    &txq->ift_sds.ifsd_tso_map[i]);
1816		if (err != 0) {
1817			device_printf(dev, "Unable to create TSO TX DMA map\n");
1818			goto fail;
1819		}
1820	}
1821	return (0);
1822fail:
1823	/* Free everything; this also handles a partial allocation failure. */
1824	iflib_tx_structures_free(ctx);
1825	return (err);
1826}
1827
1828static void
1829iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
1830{
1831	bus_dmamap_t map;
1832
1833	if (txq->ift_sds.ifsd_map != NULL) {
1834		map = txq->ift_sds.ifsd_map[i];
1835		bus_dmamap_sync(txq->ift_buf_tag, map, BUS_DMASYNC_POSTWRITE);
1836		bus_dmamap_unload(txq->ift_buf_tag, map);
1837		bus_dmamap_destroy(txq->ift_buf_tag, map);
1838		txq->ift_sds.ifsd_map[i] = NULL;
1839	}
1840
1841	if (txq->ift_sds.ifsd_tso_map != NULL) {
1842		map = txq->ift_sds.ifsd_tso_map[i];
1843		bus_dmamap_sync(txq->ift_tso_buf_tag, map,
1844		    BUS_DMASYNC_POSTWRITE);
1845		bus_dmamap_unload(txq->ift_tso_buf_tag, map);
1846		bus_dmamap_destroy(txq->ift_tso_buf_tag, map);
1847		txq->ift_sds.ifsd_tso_map[i] = NULL;
1848	}
1849}
1850
1851static void
1852iflib_txq_destroy(iflib_txq_t txq)
1853{
1854	if_ctx_t ctx = txq->ift_ctx;
1855
1856	for (int i = 0; i < txq->ift_size; i++)
1857		iflib_txsd_destroy(ctx, txq, i);
1858
1859	if (txq->ift_br != NULL) {
1860		ifmp_ring_free(txq->ift_br);
1861		txq->ift_br = NULL;
1862	}
1863
1864	mtx_destroy(&txq->ift_mtx);
1865
1866	if (txq->ift_sds.ifsd_map != NULL) {
1867		free(txq->ift_sds.ifsd_map, M_IFLIB);
1868		txq->ift_sds.ifsd_map = NULL;
1869	}
1870	if (txq->ift_sds.ifsd_tso_map != NULL) {
1871		free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
1872		txq->ift_sds.ifsd_tso_map = NULL;
1873	}
1874	if (txq->ift_sds.ifsd_m != NULL) {
1875		free(txq->ift_sds.ifsd_m, M_IFLIB);
1876		txq->ift_sds.ifsd_m = NULL;
1877	}
1878	if (txq->ift_buf_tag != NULL) {
1879		bus_dma_tag_destroy(txq->ift_buf_tag);
1880		txq->ift_buf_tag = NULL;
1881	}
1882	if (txq->ift_tso_buf_tag != NULL) {
1883		bus_dma_tag_destroy(txq->ift_tso_buf_tag);
1884		txq->ift_tso_buf_tag = NULL;
1885	}
1886	if (txq->ift_ifdi != NULL) {
1887		free(txq->ift_ifdi, M_IFLIB);
1888	}
1889}
1890
1891static void
1892iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
1893{
1894	struct mbuf **mp;
1895
1896	mp = &txq->ift_sds.ifsd_m[i];
1897	if (*mp == NULL)
1898		return;
1899
1900	if (txq->ift_sds.ifsd_map != NULL) {
1901		bus_dmamap_sync(txq->ift_buf_tag,
1902		    txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
1903		bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]);
1904	}
1905	if (txq->ift_sds.ifsd_tso_map != NULL) {
1906		bus_dmamap_sync(txq->ift_tso_buf_tag,
1907		    txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE);
1908		bus_dmamap_unload(txq->ift_tso_buf_tag,
1909		    txq->ift_sds.ifsd_tso_map[i]);
1910	}
1911	m_freem(*mp);
1912	DBG_COUNTER_INC(tx_frees);
1913	*mp = NULL;
1914}
1915
1916static int
1917iflib_txq_setup(iflib_txq_t txq)
1918{
1919	if_ctx_t ctx = txq->ift_ctx;
1920	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1921	if_shared_ctx_t sctx = ctx->ifc_sctx;
1922	iflib_dma_info_t di;
1923	int i;
1924
1925	/* Set number of descriptors available */
1926	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
1927	/* XXX make configurable */
1928	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
1929
1930	/* Reset indices */
1931	txq->ift_cidx_processed = 0;
1932	txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
1933	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
1934
1935	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
1936		bzero((void *)di->idi_vaddr, di->idi_size);
1937
1938	IFDI_TXQ_SETUP(ctx, txq->ift_id);
1939	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
1940		bus_dmamap_sync(di->idi_tag, di->idi_map,
1941		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1942	return (0);
1943}
1944
1945/*********************************************************************
1946 *
1947 *  Allocate DMA resources for RX buffers as well as memory for the RX
1948 *  mbuf map, direct RX cluster pointer map and RX cluster bus address
1949 *  map.  RX DMA map, RX mbuf map, direct RX cluster pointer map and
1950 *  RX cluster map are kept in an iflib_sw_rx_desc_array structure.
1951 *  Since we use one entry in iflib_sw_rx_desc_array per received
1952 *  packet, the maximum number of entries we'll need is equal to the
1953 *  number of hardware receive descriptors that we've allocated.
1954 *
1955 **********************************************************************/
1956static int
1957iflib_rxsd_alloc(iflib_rxq_t rxq)
1958{
1959	if_ctx_t ctx = rxq->ifr_ctx;
1960	if_shared_ctx_t sctx = ctx->ifc_sctx;
1961	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1962	device_t dev = ctx->ifc_dev;
1963	iflib_fl_t fl;
1964	int			err;
1965
1966	MPASS(scctx->isc_nrxd[0] > 0);
1967	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
1968
1969	fl = rxq->ifr_fl;
1970	for (int i = 0; i <  rxq->ifr_nfl; i++, fl++) {
1971		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
1972		/* Set up DMA tag for RX buffers. */
1973		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
1974					 1, 0,			/* alignment, bounds */
1975					 BUS_SPACE_MAXADDR,	/* lowaddr */
1976					 BUS_SPACE_MAXADDR,	/* highaddr */
1977					 NULL, NULL,		/* filter, filterarg */
1978					 sctx->isc_rx_maxsize,	/* maxsize */
1979					 sctx->isc_rx_nsegments,	/* nsegments */
1980					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
1981					 0,			/* flags */
1982					 NULL,			/* lockfunc */
1983					 NULL,			/* lockarg */
1984					 &fl->ifl_buf_tag);
1985		if (err) {
1986			device_printf(dev,
1987			    "Unable to allocate RX DMA tag: %d\n", err);
1988			goto fail;
1989		}
1990
1991		/* Allocate memory for the RX mbuf map. */
1992		if (!(fl->ifl_sds.ifsd_m =
1993		      (struct mbuf **) malloc(sizeof(struct mbuf *) *
1994					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1995			device_printf(dev,
1996			    "Unable to allocate RX mbuf map memory\n");
1997			err = ENOMEM;
1998			goto fail;
1999		}
2000
2001		/* Allocate memory for the direct RX cluster pointer map. */
2002		if (!(fl->ifl_sds.ifsd_cl =
2003		      (caddr_t *) malloc(sizeof(caddr_t) *
2004					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2005			device_printf(dev,
2006			    "Unable to allocate RX cluster map memory\n");
2007			err = ENOMEM;
2008			goto fail;
2009		}
2010
2011		/* Allocate memory for the RX cluster bus address map. */
2012		if (!(fl->ifl_sds.ifsd_ba =
2013		      (bus_addr_t *) malloc(sizeof(bus_addr_t) *
2014					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2015			device_printf(dev,
2016			    "Unable to allocate RX bus address map memory\n");
2017			err = ENOMEM;
2018			goto fail;
2019		}
2020
2021		/*
2022		 * Create the DMA maps for RX buffers.
2023		 */
2024		if (!(fl->ifl_sds.ifsd_map =
2025		      (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2026			device_printf(dev,
2027			    "Unable to allocate RX buffer DMA map memory\n");
2028			err = ENOMEM;
2029			goto fail;
2030		}
2031		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) {
2032			err = bus_dmamap_create(fl->ifl_buf_tag, 0,
2033			    &fl->ifl_sds.ifsd_map[i]);
2034			if (err != 0) {
2035				device_printf(dev, "Unable to create RX buffer DMA map\n");
2036				goto fail;
2037			}
2038		}
2039	}
2040	return (0);
2041
2042fail:
2043	iflib_rx_structures_free(ctx);
2044	return (err);
2045}
2046
2047/*
2048 * Internal service routines
2049 */
2050
2051struct rxq_refill_cb_arg {
2052	int               error;
2053	bus_dma_segment_t seg;
2054	int               nseg;
2055};
2056
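/*
 * bus_dmamap_load() callback used by iflib_fl_refill(): stash the mapping
 * error and the (single) DMA segment so the caller can read the bus address
 * synchronously after the load completes.
 */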
2057static void
2058_rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
2059{
2060	struct rxq_refill_cb_arg *cb_arg = arg;
2061
2062	cb_arg->error = error;
2063	cb_arg->seg = segs[0];
2064	cb_arg->nseg = nseg;
2065}
2066
2067/**
2068 * iflib_fl_refill - refill an rxq free-buffer list
2069 * @ctx: the iflib context
2070 * @fl: the free list to refill
2071 * @count: the number of new buffers to allocate
2072 *
2073 * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
2074 * The caller must ensure that @count does not exceed the queue's capacity
2075 * minus one (since we always leave a descriptor unavailable).
2076 */
2077static uint8_t
2078iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
2079{
2080	struct if_rxd_update iru;
2081	struct rxq_refill_cb_arg cb_arg;
2082	struct mbuf *m;
2083	caddr_t cl, *sd_cl;
2084	struct mbuf **sd_m;
2085	bus_dmamap_t *sd_map;
2086	bus_addr_t bus_addr, *sd_ba;
2087	int err, frag_idx, i, idx, n, pidx;
2088	qidx_t credits;
2089
2090	MPASS(count <= fl->ifl_size - fl->ifl_credits - 1);
2091
2092	sd_m = fl->ifl_sds.ifsd_m;
2093	sd_map = fl->ifl_sds.ifsd_map;
2094	sd_cl = fl->ifl_sds.ifsd_cl;
2095	sd_ba = fl->ifl_sds.ifsd_ba;
2096	pidx = fl->ifl_pidx;
2097	idx = pidx;
2098	frag_idx = fl->ifl_fragidx;
2099	credits = fl->ifl_credits;
2100
2101	i = 0;
2102	n = count;
2103	MPASS(n > 0);
2104	MPASS(credits + n <= fl->ifl_size);
2105
2106	if (pidx < fl->ifl_cidx)
2107		MPASS(pidx + n <= fl->ifl_cidx);
2108	if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
2109		MPASS(fl->ifl_gen == 0);
2110	if (pidx > fl->ifl_cidx)
2111		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
2112
2113	DBG_COUNTER_INC(fl_refills);
2114	if (n > 8)
2115		DBG_COUNTER_INC(fl_refills_large);
2116	iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
2117	while (n-- > 0) {
2118		/*
2119		 * We allocate an uninitialized mbuf + cluster; the mbuf is
2120		 * initialized after RX.
2121		 *
2122		 * If the cluster is still set then we know a minimum-sized
2123		 * packet was received.
2124		 */
2125		bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
2126		    &frag_idx);
2127		if (frag_idx < 0)
2128			bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
2129		MPASS(frag_idx >= 0);
2130		if ((cl = sd_cl[frag_idx]) == NULL) {
2131#ifndef __HAIKU__
2132			cl = uma_zalloc(fl->ifl_zone, M_NOWAIT);
2133			if (__predict_false(cl == NULL))
2134#else
2135			if ((cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL)
2136#endif
2137				break;
2138
2139			cb_arg.error = 0;
2140			MPASS(sd_map != NULL);
2141			err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
2142			    cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
2143			    BUS_DMA_NOWAIT);
2144			if (__predict_false(err != 0 || cb_arg.error)) {
2145#ifndef __HAIKU__
2146				uma_zfree(fl->ifl_zone, cl);
2147#else
2148				m_free(cl);
2149#endif
2150				break;
2151			}
2152
2153			sd_ba[frag_idx] = bus_addr = cb_arg.seg.ds_addr;
2154			sd_cl[frag_idx] = cl;
2155#if MEMORY_LOGGING
2156			fl->ifl_cl_enqueued++;
2157#endif
2158		} else {
2159			bus_addr = sd_ba[frag_idx];
2160		}
2161		bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
2162		    BUS_DMASYNC_PREREAD);
2163
2164		if (sd_m[frag_idx] == NULL) {
2165			m = m_gethdr(M_NOWAIT, MT_NOINIT);
2166			if (__predict_false(m == NULL))
2167				break;
2168			sd_m[frag_idx] = m;
2169		}
2170		bit_set(fl->ifl_rx_bitmap, frag_idx);
2171#if MEMORY_LOGGING
2172		fl->ifl_m_enqueued++;
2173#endif
2174
2175		DBG_COUNTER_INC(rx_allocs);
2176		fl->ifl_rxd_idxs[i] = frag_idx;
2177		fl->ifl_bus_addrs[i] = bus_addr;
2178		credits++;
2179		i++;
2180		MPASS(credits <= fl->ifl_size);
2181		if (++idx == fl->ifl_size) {
2182#ifdef INVARIANTS
2183			fl->ifl_gen = 1;
2184#endif
2185			idx = 0;
2186		}
2187		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
2188			iru.iru_pidx = pidx;
2189			iru.iru_count = i;
2190			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
2191			fl->ifl_pidx = idx;
2192			fl->ifl_credits = credits;
2193			pidx = idx;
2194			i = 0;
2195		}
2196	}
2197
2198	if (n < count - 1) {
2199		if (i != 0) {
2200			iru.iru_pidx = pidx;
2201			iru.iru_count = i;
2202			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
2203			fl->ifl_pidx = idx;
2204			fl->ifl_credits = credits;
2205		}
2206		DBG_COUNTER_INC(rxd_flush);
2207		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2208		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2209		ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
2210		    fl->ifl_id, fl->ifl_pidx);
2211		if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
2212			fl->ifl_fragidx = frag_idx + 1;
2213			if (fl->ifl_fragidx == fl->ifl_size)
2214				fl->ifl_fragidx = 0;
2215		} else {
2216			fl->ifl_fragidx = frag_idx;
2217		}
2218	}
2219
2220	return (n == -1 ? 0 : IFLIB_RXEOF_EMPTY);
2221}
2222
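/*
 * Refill @fl to its capacity less one descriptor (see the comment below on
 * why one slot is always left unused).
 */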
2223static inline uint8_t
2224iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
2225{
2226	/*
2227	 * We leave an unused descriptor to keep pidx from catching up with cidx,
2228	 * as that condition confuses most NICs. For instance,
2229	 * Intel NICs have (per receive ring) RDH and RDT registers, where
2230	 * RDH points to the next receive descriptor to be used by the NIC,
2231	 * and RDT for the next receive descriptor to be published by the
2232	 * driver to the NIC (RDT - 1 is thus the last valid one).
2233	 * The condition RDH == RDT means no descriptors are available to
2234	 * the NIC, and thus it would be ambiguous if it also meant that
2235	 * all the descriptors are available to the NIC.
2236	 */
2237	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
2238#ifdef INVARIANTS
2239	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
2240#endif
2241
2242	MPASS(fl->ifl_credits <= fl->ifl_size);
2243	MPASS(reclaimable == delta);
2244
2245	if (reclaimable > 0)
2246		return (iflib_fl_refill(ctx, fl, reclaimable));
2247	return (0);
2248}
2249
2250uint8_t
2251iflib_in_detach(if_ctx_t ctx)
2252{
2253	bool in_detach;
2254
2255	STATE_LOCK(ctx);
2256	in_detach = !!(ctx->ifc_flags & IFC_IN_DETACH);
2257	STATE_UNLOCK(ctx);
2258	return (in_detach);
2259}
2260
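/*
 * Release every cluster and mbuf currently held by a free list and reset the
 * list's indices and credits.  The DMA maps themselves are kept for reuse;
 * they are only destroyed in iflib_rx_sds_free().
 */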
2261static void
2262iflib_fl_bufs_free(iflib_fl_t fl)
2263{
2264	iflib_dma_info_t idi = fl->ifl_ifdi;
2265	bus_dmamap_t sd_map;
2266	uint32_t i;
2267
2268	for (i = 0; i < fl->ifl_size; i++) {
2269		struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
2270		caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
2271
2272		if (*sd_cl != NULL) {
2273			sd_map = fl->ifl_sds.ifsd_map[i];
2274			bus_dmamap_sync(fl->ifl_buf_tag, sd_map,
2275			    BUS_DMASYNC_POSTREAD);
2276			bus_dmamap_unload(fl->ifl_buf_tag, sd_map);
2277#ifndef __HAIKU__
2278			uma_zfree(fl->ifl_zone, *sd_cl);
2279#else
2280			struct mbuf* mb = m_get(0, MT_DATA);
2281			m_cljset(mb, *sd_cl, fl->ifl_cltype);
2282			m_free(mb);
2283#endif
2284			*sd_cl = NULL;
2285			if (*sd_m != NULL) {
2286				m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
2287#ifndef __HAIKU__
2288				m_free_raw(*sd_m);
2289#else
2290				m_free(*sd_m);
2291#endif
2292				*sd_m = NULL;
2293			}
2294		} else {
2295			MPASS(*sd_m == NULL);
2296		}
2297#if MEMORY_LOGGING
2298		fl->ifl_m_dequeued++;
2299		fl->ifl_cl_dequeued++;
2300#endif
2301	}
2302#ifdef INVARIANTS
2303	for (i = 0; i < fl->ifl_size; i++) {
2304		MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
2305		MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
2306	}
2307#endif
2308	/*
2309	 * Reset free list values
2310	 */
2311	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0;
2312	bzero(idi->idi_vaddr, idi->idi_size);
2313}
2314
2315/*********************************************************************
2316 *
2317 *  Initialize a free list and its buffers.
2318 *
2319 **********************************************************************/
2320static int
2321iflib_fl_setup(iflib_fl_t fl)
2322{
2323	iflib_rxq_t rxq = fl->ifl_rxq;
2324	if_ctx_t ctx = rxq->ifr_ctx;
2325	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2326	int qidx;
2327
2328	bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
2329	/*
2330	** Free current RX buffer structs and their mbufs
2331	*/
2332	iflib_fl_bufs_free(fl);
2333	/* Now replenish the mbufs */
2334	MPASS(fl->ifl_credits == 0);
2335	qidx = rxq->ifr_fl_offset + fl->ifl_id;
2336	if (scctx->isc_rxd_buf_size[qidx] != 0)
2337		fl->ifl_buf_size = scctx->isc_rxd_buf_size[qidx];
2338	else
2339		fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz;
2340	/*
2341	 * ifl_buf_size may be a driver-supplied value, so pull it up
2342	 * to the selected mbuf size.
2343	 */
2344	fl->ifl_buf_size = iflib_get_mbuf_size_for(fl->ifl_buf_size);
2345	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
2346		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
2347	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
2348#ifndef __HAIKU__
2349	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
2350#endif
2351
2352	/*
2353	 * Avoid pre-allocating zillions of clusters to an idle card
2354	 * potentially speeding up attach. In any case make sure
2355	 * to leave a descriptor unavailable. See the comment in
2356	 * iflib_fl_refill_all().
2357	 */
2358	MPASS(fl->ifl_size > 0);
2359	(void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size - 1));
2360	if (min(128, fl->ifl_size - 1) != fl->ifl_credits)
2361		return (ENOBUFS);
2362	/*
2363	 * handle failure
2364	 */
2365	MPASS(rxq != NULL);
2366	MPASS(fl->ifl_ifdi != NULL);
2367	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2368	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2369	return (0);
2370}
2371
2372/*********************************************************************
2373 *
2374 *  Free receive ring data structures
2375 *
2376 **********************************************************************/
2377static void
2378iflib_rx_sds_free(iflib_rxq_t rxq)
2379{
2380	iflib_fl_t fl;
2381	int i, j;
2382
2383	if (rxq->ifr_fl != NULL) {
2384		for (i = 0; i < rxq->ifr_nfl; i++) {
2385			fl = &rxq->ifr_fl[i];
2386			if (fl->ifl_buf_tag != NULL) {
2387				if (fl->ifl_sds.ifsd_map != NULL) {
2388					for (j = 0; j < fl->ifl_size; j++) {
2389						bus_dmamap_sync(
2390						    fl->ifl_buf_tag,
2391						    fl->ifl_sds.ifsd_map[j],
2392						    BUS_DMASYNC_POSTREAD);
2393						bus_dmamap_unload(
2394						    fl->ifl_buf_tag,
2395						    fl->ifl_sds.ifsd_map[j]);
2396						bus_dmamap_destroy(
2397						    fl->ifl_buf_tag,
2398						    fl->ifl_sds.ifsd_map[j]);
2399					}
2400				}
2401				bus_dma_tag_destroy(fl->ifl_buf_tag);
2402				fl->ifl_buf_tag = NULL;
2403			}
2404			free(fl->ifl_sds.ifsd_m, M_IFLIB);
2405			free(fl->ifl_sds.ifsd_cl, M_IFLIB);
2406			free(fl->ifl_sds.ifsd_ba, M_IFLIB);
2407			free(fl->ifl_sds.ifsd_map, M_IFLIB);
2408			free(fl->ifl_rx_bitmap, M_IFLIB);
2409			fl->ifl_sds.ifsd_m = NULL;
2410			fl->ifl_sds.ifsd_cl = NULL;
2411			fl->ifl_sds.ifsd_ba = NULL;
2412			fl->ifl_sds.ifsd_map = NULL;
2413			fl->ifl_rx_bitmap = NULL;
2414		}
2415		free(rxq->ifr_fl, M_IFLIB);
2416		rxq->ifr_fl = NULL;
2417		free(rxq->ifr_ifdi, M_IFLIB);
2418		rxq->ifr_ifdi = NULL;
2419		rxq->ifr_cq_cidx = 0;
2420	}
2421}
2422
2423/*
2424 * Timer routine
2425 */
2426static void
2427iflib_timer(void *arg)
2428{
2429	iflib_txq_t txq = arg;
2430	if_ctx_t ctx = txq->ift_ctx;
2431	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2432	uint64_t this_tick = ticks;
2433
2434	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
2435		return;
2436
2437	/*
2438	** Check on the state of the TX queue(s), this
2439	** can be done without the lock because it's RO
2440	** and the HUNG state will be static if set.
2441	*/
2442	if (this_tick - txq->ift_last_timer_tick >= iflib_timer_default) {
2443		txq->ift_last_timer_tick = this_tick;
2444		IFDI_TIMER(ctx, txq->ift_id);
2445		if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
2446		    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
2447		     (sctx->isc_pause_frames == 0)))
2448			goto hung;
2449
2450		if (txq->ift_qstatus != IFLIB_QUEUE_IDLE &&
2451		    ifmp_ring_is_stalled(txq->ift_br)) {
2452			KASSERT(ctx->ifc_link_state == LINK_STATE_UP,
2453			    ("queue can't be marked as hung if interface is down"));
2454			txq->ift_qstatus = IFLIB_QUEUE_HUNG;
2455		}
2456		txq->ift_cleaned_prev = txq->ift_cleaned;
2457	}
2458	/* handle any laggards */
2459	if (txq->ift_db_pending)
2460		GROUPTASK_ENQUEUE(&txq->ift_task);
2461
2462	sctx->isc_pause_frames = 0;
2463	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)
2464		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer,
2465		    txq, txq->ift_timer.c_cpu);
2466	return;
2467
2468 hung:
2469	device_printf(ctx->ifc_dev,
2470	    "Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n",
2471	    txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
2472	STATE_LOCK(ctx);
2473	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2474	ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET);
2475	iflib_admin_intr_deferred(ctx);
2476	STATE_UNLOCK(ctx);
2477}
2478
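/*
 * Map a requested buffer size to one of the two cluster sizes iflib uses:
 * standard clusters (MCLBYTES) for anything that fits, page-sized jumbo
 * clusters (MJUMPAGESIZE) otherwise.
 */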
2479static uint16_t
2480iflib_get_mbuf_size_for(unsigned int size)
2481{
2482
2483	if (size <= MCLBYTES)
2484		return (MCLBYTES);
2485	else
2486		return (MJUMPAGESIZE);
2487}
2488
2489static void
2490iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
2491{
2492	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2493
2494	/*
2495	 * XXX don't set the max_frame_size to larger
2496	 * than the hardware can handle
2497	 */
2498	ctx->ifc_rx_mbuf_sz =
2499	    iflib_get_mbuf_size_for(sctx->isc_max_frame_size);
2500}
2501
2502uint32_t
2503iflib_get_rx_mbuf_sz(if_ctx_t ctx)
2504{
2505
2506	return (ctx->ifc_rx_mbuf_sz);
2507}
2508
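/*
 * Bring the interface up; callers are expected to hold the context lock
 * (hence the _locked suffix).  Program the hardware offload flags from the
 * enabled capabilities, (re)initialize the TX and RX queues and their free
 * lists, call the driver's IFDI_INIT, and restart the per-queue watchdog
 * timers.
 */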
2509static void
2510iflib_init_locked(if_ctx_t ctx)
2511{
2512	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2513	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2514	if_t ifp = ctx->ifc_ifp;
2515	iflib_fl_t fl;
2516	iflib_txq_t txq;
2517	iflib_rxq_t rxq;
2518	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
2519
2520	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2521	IFDI_INTR_DISABLE(ctx);
2522
2523	/*
2524	 * See iflib_stop(). Useful in case iflib_init_locked() is
2525	 * called without first calling iflib_stop().
2526	 */
2527	netmap_disable_all_rings(ifp);
2528
2529	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
2530	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
2531	/* Set hardware offload abilities */
2532	if_clearhwassist(ifp);
2533	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
2534		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
2535	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
2536		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
2537	if (if_getcapenable(ifp) & IFCAP_TSO4)
2538		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2539	if (if_getcapenable(ifp) & IFCAP_TSO6)
2540		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2541
2542	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
2543		CALLOUT_LOCK(txq);
2544		callout_stop(&txq->ift_timer);
2545#ifdef DEV_NETMAP
2546		callout_stop(&txq->ift_netmap_timer);
2547#endif /* DEV_NETMAP */
2548		CALLOUT_UNLOCK(txq);
2549		iflib_netmap_txq_init(ctx, txq);
2550	}
2551
2552	/*
2553	 * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
2554	 * that drivers can use the value when setting up the hardware receive
2555	 * buffers.
2556	 */
2557	iflib_calc_rx_mbuf_sz(ctx);
2558
2559#ifdef INVARIANTS
2560	i = if_getdrvflags(ifp);
2561#endif
2562	IFDI_INIT(ctx);
2563	MPASS(if_getdrvflags(ifp) == i);
2564	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
2565		if (iflib_netmap_rxq_init(ctx, rxq) > 0) {
2566			/* This rxq is in netmap mode. Skip normal init. */
2567			continue;
2568		}
2569		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
2570			if (iflib_fl_setup(fl)) {
2571				device_printf(ctx->ifc_dev,
2572				    "setting up free list %d failed - "
2573				    "check cluster settings\n", j);
2574				goto done;
2575			}
2576		}
2577	}
2578done:
2579	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
2580	IFDI_INTR_ENABLE(ctx);
2581	txq = ctx->ifc_txqs;
2582	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
2583		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
2584			txq->ift_timer.c_cpu);
2585
2586	/* Re-enable txsync/rxsync. */
2587	netmap_enable_all_rings(ifp);
2588}
2589
2590static int
2591iflib_media_change(if_t ifp)
2592{
2593	if_ctx_t ctx = if_getsoftc(ifp);
2594	int err;
2595
2596	CTX_LOCK(ctx);
2597	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
2598		iflib_if_init_locked(ctx);
2599	CTX_UNLOCK(ctx);
2600	return (err);
2601}
2602
2603static void
2604iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
2605{
2606	if_ctx_t ctx = if_getsoftc(ifp);
2607
2608	CTX_LOCK(ctx);
2609	IFDI_UPDATE_ADMIN_STATUS(ctx);
2610	IFDI_MEDIA_STATUS(ctx, ifmr);
2611	CTX_UNLOCK(ctx);
2612}
2613
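/*
 * Bring the interface down: mark it inactive, disable interrupts, stop the
 * hardware via IFDI_STOP, quiesce netmap, then drain and reset all TX and RX
 * software queue state (freeing any mbufs still attached to descriptors).
 */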
2614void
2615iflib_stop(if_ctx_t ctx)
2616{
2617	iflib_txq_t txq = ctx->ifc_txqs;
2618	iflib_rxq_t rxq = ctx->ifc_rxqs;
2619	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2620	if_shared_ctx_t sctx = ctx->ifc_sctx;
2621	iflib_dma_info_t di;
2622	iflib_fl_t fl;
2623	int i, j;
2624
2625	/* Tell the stack that the interface is no longer active */
2626	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2627
2628	IFDI_INTR_DISABLE(ctx);
2629	DELAY(1000);
2630	IFDI_STOP(ctx);
2631	DELAY(1000);
2632
2633	/*
2634	 * Stop any pending txsync/rxsync and prevent new ones
2635	 * from starting. Processes blocked in poll() will get
2636	 * POLLERR.
2637	 */
2638	netmap_disable_all_rings(ctx->ifc_ifp);
2639
2640	iflib_debug_reset();
2641	/* Wait for current tx queue users to exit to disarm watchdog timer. */
2642	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
2643		/* make sure all transmitters have completed before proceeding XXX */
2644
2645		CALLOUT_LOCK(txq);
2646		callout_stop(&txq->ift_timer);
2647#ifdef DEV_NETMAP
2648		callout_stop(&txq->ift_netmap_timer);
2649#endif /* DEV_NETMAP */
2650		CALLOUT_UNLOCK(txq);
2651
2652		/* clean any enqueued buffers */
2653		iflib_ifmp_purge(txq);
2654		/* Free any existing tx buffers. */
2655		for (j = 0; j < txq->ift_size; j++) {
2656			iflib_txsd_free(ctx, txq, j);
2657		}
2658		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
2659		txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
2660		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
2661		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
2662		txq->ift_pullups = 0;
2663		ifmp_ring_reset_stats(txq->ift_br);
2664		for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
2665			bzero((void *)di->idi_vaddr, di->idi_size);
2666	}
2667	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
2668		/* make sure all transmitters have completed before proceeding XXX */
2669
2670		rxq->ifr_cq_cidx = 0;
2671		for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
2672			bzero((void *)di->idi_vaddr, di->idi_size);
2673		/* also resets the free lists pidx/cidx */
2674		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
2675			iflib_fl_bufs_free(fl);
2676	}
2677}
2678
2679static inline caddr_t
2680calc_next_rxd(iflib_fl_t fl, int cidx)
2681{
2682	qidx_t size;
2683	int nrxd;
2684	caddr_t start, end, cur, next;
2685
2686	nrxd = fl->ifl_size;
2687	size = fl->ifl_rxd_size;
2688	start = fl->ifl_ifdi->idi_vaddr;
2689
2690	if (__predict_false(size == 0))
2691		return (start);
2692	cur = start + size*cidx;
2693	end = start + size*nrxd;
2694	next = CACHE_PTR_NEXT(cur);
2695	return (next < end ? next : start);
2696}
2697
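/*
 * Warm the caches for the next few RX software descriptors (mbuf and cluster
 * pointers) and the next hardware descriptor cache line.  The index masking
 * assumes the free-list size is a power of two.
 */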
2698static inline void
2699prefetch_pkts(iflib_fl_t fl, int cidx)
2700{
2701	int nextptr;
2702	int nrxd = fl->ifl_size;
2703	caddr_t next_rxd;
2704
2705	nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1);
2706	prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
2707	prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
2708	next_rxd = calc_next_rxd(fl, cidx);
2709	prefetch(next_rxd);
2710	prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]);
2711	prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]);
2712	prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]);
2713	prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]);
2714	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]);
2715	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]);
2716	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]);
2717	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
2718}
2719
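/*
 * Translate a single received fragment into its software descriptor state:
 * sync the cluster's DMA map, optionally run pfil hooks on the payload, and
 * hand back the mbuf associated with the fragment (NULL if a filter consumed
 * or dropped it).  Advances the free list's cidx and clears the bitmap slot.
 */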
2720static struct mbuf *
2721rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
2722    int *pf_rv, if_rxd_info_t ri)
2723{
2724	bus_dmamap_t map;
2725	iflib_fl_t fl;
2726	caddr_t payload;
2727	struct mbuf *m;
2728	int flid, cidx, len, next;
2729
2730	map = NULL;
2731	flid = irf->irf_flid;
2732	cidx = irf->irf_idx;
2733	fl = &rxq->ifr_fl[flid];
2734	sd->ifsd_fl = fl;
2735	m = fl->ifl_sds.ifsd_m[cidx];
2736	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
2737	fl->ifl_credits--;
2738#if MEMORY_LOGGING
2739	fl->ifl_m_dequeued++;
2740#endif
2741	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
2742		prefetch_pkts(fl, cidx);
2743	next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
2744	prefetch(&fl->ifl_sds.ifsd_map[next]);
2745	map = fl->ifl_sds.ifsd_map[cidx];
2746
2747	bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
2748
2749#ifndef __HAIKU__
2750	if (rxq->pfil != NULL && PFIL_HOOKED_IN(rxq->pfil) && pf_rv != NULL &&
2751	    irf->irf_len != 0) {
2752		payload  = *sd->ifsd_cl;
2753		payload +=  ri->iri_pad;
2754		len = ri->iri_len - ri->iri_pad;
2755		*pf_rv = pfil_run_hooks(rxq->pfil, payload, ri->iri_ifp,
2756		    len | PFIL_MEMPTR | PFIL_IN, NULL);
2757		switch (*pf_rv) {
2758		case PFIL_DROPPED:
2759		case PFIL_CONSUMED:
2760			/*
2761			 * The filter ate it.  Everything is recycled.
2762			 */
2763			m = NULL;
2764			unload = 0;
2765			break;
2766		case PFIL_REALLOCED:
2767			/*
2768			 * The filter copied it.  Everything is recycled.
2769			 */
2770			m = pfil_mem2mbuf(payload);
2771			unload = 0;
2772			break;
2773		case PFIL_PASS:
2774			/*
2775			 * Filter said it was OK, so receive like
2776			 * normal
2777			 */
2778			fl->ifl_sds.ifsd_m[cidx] = NULL;
2779			break;
2780		default:
2781			MPASS(0);
2782		}
2783	} else
2784#endif
2785	{
2786		fl->ifl_sds.ifsd_m[cidx] = NULL;
2787		if (pf_rv != NULL)
2788			*pf_rv = PFIL_PASS;
2789	}
2790
2791	if (unload && irf->irf_len != 0)
2792		bus_dmamap_unload(fl->ifl_buf_tag, map);
2793	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
2794	if (__predict_false(fl->ifl_cidx == 0))
2795		fl->ifl_gen = 0;
2796	bit_clear(fl->ifl_rx_bitmap, cidx);
2797	return (m);
2798}
2799
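/*
 * Chain the fragments of a multi-descriptor packet into a single mbuf chain,
 * attaching each cluster to its mbuf.  Any pad is assumed to be on the first
 * fragment only; zero-length fragments and fragments consumed by pfil are
 * skipped.
 */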
2800static struct mbuf *
2801assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd, int *pf_rv)
2802{
2803	struct mbuf *m, *mh, *mt;
2804	caddr_t cl;
2805	int  *pf_rv_ptr, flags, i, padlen;
2806	bool consumed;
2807
2808	i = 0;
2809	mh = NULL;
2810	consumed = false;
2811	*pf_rv = PFIL_PASS;
2812	pf_rv_ptr = pf_rv;
2813	do {
2814		m = rxd_frag_to_sd(rxq, &ri->iri_frags[i], !consumed, sd,
2815		    pf_rv_ptr, ri);
2816
2817		MPASS(*sd->ifsd_cl != NULL);
2818
2819		/*
2820		 * Exclude zero-length frags & frags from
2821		 * packets the filter has consumed or dropped
2822		 */
2823		if (ri->iri_frags[i].irf_len == 0 || consumed ||
2824#ifndef __HAIKU__
2825		    *pf_rv == PFIL_CONSUMED || *pf_rv == PFIL_DROPPED
2826#else
2827			0
2828#endif
2829		     ) {
2830			if (mh == NULL) {
2831				/* everything saved here */
2832				consumed = true;
2833				pf_rv_ptr = NULL;
2834				continue;
2835			}
2836			/* XXX we can save the cluster here, but not the mbuf */
2837			m_init(m, M_NOWAIT, MT_DATA, 0);
2838			m_free(m);
2839			continue;
2840		}
2841		if (mh == NULL) {
2842			flags = M_PKTHDR|M_EXT;
2843			mh = mt = m;
2844			padlen = ri->iri_pad;
2845		} else {
2846			flags = M_EXT;
2847			mt->m_next = m;
2848			mt = m;
2849			/* assuming padding is only on the first fragment */
2850			padlen = 0;
2851		}
2852		cl = *sd->ifsd_cl;
2853		*sd->ifsd_cl = NULL;
2854
2855		/* Can these two be made one ? */
2856		m_init(m, M_NOWAIT, MT_DATA, flags);
2857		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
2858		/*
2859		 * These must follow m_init and m_cljset
2860		 */
2861		m->m_data += padlen;
2862		ri->iri_len -= padlen;
2863		m->m_len = ri->iri_frags[i].irf_len;
2864	} while (++i < ri->iri_nfrags);
2865
2866	return (mh);
2867}
2868
2869/*
2870 * Process one software descriptor
2871 */
2872static struct mbuf *
2873iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
2874{
2875	struct if_rxsd sd;
2876	struct mbuf *m;
2877	int pf_rv;
2878
2879	/* should I merge this back in now that the two paths are basically duplicated? */
2880	if (ri->iri_nfrags == 1 &&
2881	    ri->iri_frags[0].irf_len != 0 &&
2882	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
2883		m = rxd_frag_to_sd(rxq, &ri->iri_frags[0], false, &sd,
2884		    &pf_rv, ri);
2885		if (pf_rv != PFIL_PASS
2886#ifndef __HAIKU__
2887		        && pf_rv != PFIL_REALLOCED
2888#endif
2889		        )
2890			return (m);
2891		if (pf_rv == PFIL_PASS) {
2892			m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
2893#ifndef __NO_STRICT_ALIGNMENT
2894			if (!IP_ALIGNED(m))
2895				m->m_data += 2;
2896#endif
2897			memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
2898			m->m_len = ri->iri_frags[0].irf_len;
2899		}
2900	} else {
2901		m = assemble_segments(rxq, ri, &sd, &pf_rv);
2902		if (m == NULL)
2903			return (NULL);
2904		if (pf_rv != PFIL_PASS
2905#ifndef __HAIKU__
2906		        && pf_rv != PFIL_REALLOCED
2907#endif
2908				)
2909			return (m);
2910	}
2911	m->m_pkthdr.len = ri->iri_len;
2912	m->m_pkthdr.rcvif = ri->iri_ifp;
2913	m->m_flags |= ri->iri_flags;
2914	m->m_pkthdr.ether_vtag = ri->iri_vtag;
2915	m->m_pkthdr.flowid = ri->iri_flowid;
2916	M_HASHTYPE_SET(m, ri->iri_rsstype);
2917	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
2918	m->m_pkthdr.csum_data = ri->iri_csum_data;
2919	return (m);
2920}
2921
2922#if defined(INET6) || defined(INET)
2923static void
2924iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
2925{
2926	CURVNET_SET(lc->ifp->if_vnet);
2927#if defined(INET6)
2928	*v6 = V_ip6_forwarding;
2929#endif
2930#if defined(INET)
2931	*v4 = V_ipforwarding;
2932#endif
2933	CURVNET_RESTORE();
2934}
2935
2936/*
2937 * Returns true if it's possible this packet could be LROed.
2938 * If it returns false, it is guaranteed that tcp_lro_rx()
2939 * would not return zero.
2940 */
2941static bool
2942iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
2943{
2944#ifndef __HAIKU__
2945	struct ether_header *eh;
2946
2947	eh = mtod(m, struct ether_header *);
2948	switch (eh->ether_type) {
2949#if defined(INET6)
2950		case htons(ETHERTYPE_IPV6):
2951			return (!v6_forwarding);
2952#endif
2953#if defined (INET)
2954		case htons(ETHERTYPE_IP):
2955			return (!v4_forwarding);
2956#endif
2957	}
2958#endif
2959
2960	return false;
2961}
2962#else
2963static void
2964iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
2965{
2966}
2967#endif
2968
2969static void
2970_task_fn_rx_watchdog(void *context)
2971{
2972	iflib_rxq_t rxq = context;
2973
2974	GROUPTASK_ENQUEUE(&rxq->ifr_task);
2975}
2976
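/*
 * Main RX processing loop for one queue: pull up to @budget packets from the
 * hardware, turn them into mbuf chains, refill the free lists, and hand the
 * packets to LRO or if_input().  Returns IFLIB_RXEOF_* flags indicating
 * whether more work or another refill attempt is pending.
 */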
2977static uint8_t
2978iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
2979{
2980	if_t ifp;
2981	if_ctx_t ctx = rxq->ifr_ctx;
2982	if_shared_ctx_t sctx = ctx->ifc_sctx;
2983	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2984	int avail, i;
2985	qidx_t *cidxp;
2986	struct if_rxd_info ri;
2987	int err, budget_left, rx_bytes, rx_pkts;
2988	iflib_fl_t fl;
2989	int lro_enabled;
2990	bool v4_forwarding, v6_forwarding, lro_possible;
2991	uint8_t retval = 0;
2992
2993	/*
2994	 * XXX early demux data packets so that if_input processing only handles
2995	 * acks in interrupt context
2996	 */
2997	struct mbuf *m, *mh, *mt, *mf;
2998
2999#ifndef __HAIKU__
3000	NET_EPOCH_ASSERT();
3001#endif
3002
3003	lro_possible = v4_forwarding = v6_forwarding = false;
3004	ifp = ctx->ifc_ifp;
3005	mh = mt = NULL;
3006	MPASS(budget > 0);
3007	rx_pkts	= rx_bytes = 0;
3008	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
3009		cidxp = &rxq->ifr_cq_cidx;
3010	else
3011		cidxp = &rxq->ifr_fl[0].ifl_cidx;
3012	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
3013		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
3014			retval |= iflib_fl_refill_all(ctx, fl);
3015		DBG_COUNTER_INC(rx_unavail);
3016		return (retval);
3017	}
3018
3019	/* pfil needs the vnet to be set */
3020	CURVNET_SET_QUIET(ifp->if_vnet);
3021	for (budget_left = budget; budget_left > 0 && avail > 0;) {
3022		if (__predict_false(!CTX_ACTIVE(ctx))) {
3023			DBG_COUNTER_INC(rx_ctx_inactive);
3024			break;
3025		}
3026		/*
3027		 * Reset client set fields to their default values
3028		 */
3029		rxd_info_zero(&ri);
3030		ri.iri_qsidx = rxq->ifr_id;
3031		ri.iri_cidx = *cidxp;
3032		ri.iri_ifp = ifp;
3033		ri.iri_frags = rxq->ifr_frags;
3034		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
3035
3036		if (err)
3037			goto err;
3038		rx_pkts += 1;
3039		rx_bytes += ri.iri_len;
3040		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
3041			*cidxp = ri.iri_cidx;
3042			/* Update our consumer index */
3043			/* XXX NB: shurd - check if this is still safe */
3044			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0])
3045				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
3046			/* was this only a completion queue message? */
3047			if (__predict_false(ri.iri_nfrags == 0))
3048				continue;
3049		}
3050		MPASS(ri.iri_nfrags != 0);
3051		MPASS(ri.iri_len != 0);
3052
3053		/* will advance the cidx on the corresponding free lists */
3054		m = iflib_rxd_pkt_get(rxq, &ri);
3055		avail--;
3056		budget_left--;
3057		if (avail == 0 && budget_left)
3058			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
3059
3060		if (__predict_false(m == NULL))
3061			continue;
3062
3063		/* imm_pkt: -- cxgb */
3064		if (mh == NULL)
3065			mh = mt = m;
3066		else {
3067			mt->m_nextpkt = m;
3068			mt = m;
3069		}
3070	}
3071	CURVNET_RESTORE();
3072	/* make sure that we can refill faster than drain */
3073	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
3074		retval |= iflib_fl_refill_all(ctx, fl);
3075
3076	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
3077#ifndef __HAIKU__
3078	if (lro_enabled)
3079		iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
3080#endif
3081	mt = mf = NULL;
3082	while (mh != NULL) {
3083		m = mh;
3084		mh = mh->m_nextpkt;
3085		m->m_nextpkt = NULL;
3086#ifndef __NO_STRICT_ALIGNMENT
3087		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
3088			continue;
3089#endif
3090#ifndef __HAIKU__
3091#if defined(INET6) || defined(INET)
3092		if (lro_enabled) {
3093			if (!lro_possible) {
3094				lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
3095				if (lro_possible && mf != NULL) {
3096					ifp->if_input(ifp, mf);
3097					DBG_COUNTER_INC(rx_if_input);
3098					mt = mf = NULL;
3099				}
3100			}
3101			if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
3102			    (CSUM_L4_CALC|CSUM_L4_VALID)) {
3103				if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
3104					continue;
3105			}
3106		}
3107#endif
3108		if (lro_possible) {
3109			ifp->if_input(ifp, m);
3110			DBG_COUNTER_INC(rx_if_input);
3111			continue;
3112		}
3113#endif
3114
3115		if (mf == NULL)
3116			mf = m;
3117		if (mt != NULL)
3118			mt->m_nextpkt = m;
3119		mt = m;
3120	}
3121	if (mf != NULL) {
3122		ifp->if_input(ifp, mf);
3123		DBG_COUNTER_INC(rx_if_input);
3124	}
3125
3126	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
3127	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
3128
3129	/*
3130	 * Flush any outstanding LRO work
3131	 */
3132#if defined(INET6) || defined(INET)
3133#ifndef __HAIKU__
3134	tcp_lro_flush_all(&rxq->ifr_lc);
3135#endif
3136#endif
3137	if (avail != 0 || iflib_rxd_avail(ctx, rxq, *cidxp, 1) != 0)
3138		retval |= IFLIB_RXEOF_MORE;
3139	return (retval);
3140err:
3141	STATE_LOCK(ctx);
3142	ctx->ifc_flags |= IFC_DO_RESET;
3143	iflib_admin_intr_deferred(ctx);
3144	STATE_UNLOCK(ctx);
3145	return (0);
3146}
3147
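/*
 * Doorbell/report-status deferral heuristics: the busier the ring, the more
 * updates we batch before notifying the hardware.  As a hypothetical example,
 * with a 1024-descriptor ring and an update frequency of 16, notify_count is
 * 1024 / 16 - 1 = 63 and minthresh is 1024 / 8 = 128, so a queue with more
 * than 512 descriptors in use may let up to 63 descriptors accumulate before
 * ringing the doorbell, while a nearly idle queue defers none.
 */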
3148#define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
3149static inline qidx_t
3150txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
3151{
3152	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
3153	qidx_t minthresh = txq->ift_size / 8;
3154	if (in_use > 4*minthresh)
3155		return (notify_count);
3156	if (in_use > 2*minthresh)
3157		return (notify_count >> 1);
3158	if (in_use > minthresh)
3159		return (notify_count >> 3);
3160	return (0);
3161}
3162
3163static inline qidx_t
3164txq_max_rs_deferred(iflib_txq_t txq)
3165{
3166	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
3167	qidx_t minthresh = txq->ift_size / 8;
3168	if (txq->ift_in_use > 4*minthresh)
3169		return (notify_count);
3170	if (txq->ift_in_use > 2*minthresh)
3171		return (notify_count >> 1);
3172	if (txq->ift_in_use > minthresh)
3173		return (notify_count >> 2);
3174	return (2);
3175}
3176
3177#define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
3178#define M_HAS_VLANTAG(m) ((m)->m_flags & M_VLANTAG)
3179
3180#define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
3181#define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
3182#define TXQ_MAX_DB_CONSUMED(size) (size >> 4)
3183
3184/* forward compatibility for cxgb */
3185#define FIRST_QSET(ctx) 0
3186#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
3187#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
3188#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
3189#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
3190
3191/* XXX we should be setting this to something other than zero */
3192#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
3193#define	MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
3194    (ctx)->ifc_softc_ctx.isc_tx_nsegments)
3195
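/*
 * Decide whether to ring the TX doorbell now: always when @ring is set, and
 * otherwise when enough descriptors have accumulated (see
 * txq_max_db_deferred()) or the ring is nearly full.  Returns true if the
 * doorbell was written.
 */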
3196static inline bool
3197iflib_txd_db_check(iflib_txq_t txq, int ring)
3198{
3199	if_ctx_t ctx = txq->ift_ctx;
3200	qidx_t dbval, max;
3201
3202	max = TXQ_MAX_DB_DEFERRED(txq, txq->ift_in_use);
3203
3204	/* force || threshold exceeded || at the edge of the ring */
3205	if (ring || (txq->ift_db_pending >= max) || (TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2)) {
3206
3207		/*
3208		 * 'npending' is used if the card's doorbell is in terms of the number of descriptors
3209		 * pending flush (BRCM). 'pidx' is used in cases where the card's doorbell uses the
3210		 * producer index explicitly (INTC).
3211		 */
3212		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
3213		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3214		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
3215		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
3216
3217		/*
3218		 * Absent bugs there are zero packets pending so reset pending counts to zero.
3219		 */
3220		txq->ift_db_pending = txq->ift_npending = 0;
3221		return (true);
3222	}
3223	return (false);
3224}
3225
3226#ifdef PKT_DEBUG
3227static void
3228print_pkt(if_pkt_info_t pi)
3229{
3230	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
3231	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
3232	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
3233	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
3234	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
3235	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
3236}
3237#endif
3238
3239#define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
3240#define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
3241#define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
3242#define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
3243
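/*
 * Parse the Ethernet/IP/TCP headers of *mp into @pi so the driver's encap
 * routine has the lengths, protocol and TSO parameters it needs for checksum
 * and TSO offload.  May pull up or duplicate the mbuf; the (possibly new)
 * chain is handed back through @mp.
 */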
3244static int
3245iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
3246{
3247	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
3248	struct ether_vlan_header *eh;
3249	struct mbuf *m;
3250
3251	m = *mp;
3252	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
3253	    M_WRITABLE(m) == 0) {
3254		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
3255			return (ENOMEM);
3256		} else {
3257			m_freem(*mp);
3258			DBG_COUNTER_INC(tx_frees);
3259			*mp = m;
3260		}
3261	}
3262
3263	/*
3264	 * Determine where frame payload starts.
3265	 * Jump over vlan headers if already present,
3266	 * helpful for QinQ too.
3267	 */
3268	if (__predict_false(m->m_len < sizeof(*eh))) {
3269		txq->ift_pullups++;
3270		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
3271			return (ENOMEM);
3272	}
3273	eh = mtod(m, struct ether_vlan_header *);
3274	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
3275		pi->ipi_etype = ntohs(eh->evl_proto);
3276		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3277	} else {
3278		pi->ipi_etype = ntohs(eh->evl_encap_proto);
3279		pi->ipi_ehdrlen = ETHER_HDR_LEN;
3280	}
3281
3282	switch (pi->ipi_etype) {
3283#ifdef INET
3284	case ETHERTYPE_IP:
3285	{
3286		struct mbuf *n;
3287		struct ip *ip = NULL;
3288		struct tcphdr *th = NULL;
3289		int minthlen;
3290
3291		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
3292		if (__predict_false(m->m_len < minthlen)) {
3293			/*
3294			 * if this code bloat is causing too much of a hit
3295			 * move it to a separate function and mark it noinline
3296			 */
3297			if (m->m_len == pi->ipi_ehdrlen) {
3298				n = m->m_next;
3299				MPASS(n);
3300				if (n->m_len >= sizeof(*ip))  {
3301					ip = (struct ip *)n->m_data;
3302					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3303						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3304				} else {
3305					txq->ift_pullups++;
3306					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
3307						return (ENOMEM);
3308					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3309				}
3310			} else {
3311				txq->ift_pullups++;
3312				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
3313					return (ENOMEM);
3314				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3315				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3316					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3317			}
3318		} else {
3319			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3320			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3321				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3322		}
3323		pi->ipi_ip_hlen = ip->ip_hl << 2;
3324		pi->ipi_ipproto = ip->ip_p;
3325		pi->ipi_flags |= IPI_TX_IPV4;
3326
3327		/* TCP checksum offload may require TCP header length */
3328		if (IS_TX_OFFLOAD4(pi)) {
3329			if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
3330				if (__predict_false(th == NULL)) {
3331					txq->ift_pullups++;
3332					if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
3333						return (ENOMEM);
3334					th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
3335				}
3336				pi->ipi_tcp_hflags = th->th_flags;
3337				pi->ipi_tcp_hlen = th->th_off << 2;
3338				pi->ipi_tcp_seq = th->th_seq;
3339			}
3340			if (IS_TSO4(pi)) {
3341				if (__predict_false(ip->ip_p != IPPROTO_TCP))
3342					return (ENXIO);
3343				/*
3344				 * TSO always requires hardware checksum offload.
3345				 */
3346				pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
3347				th->th_sum = in_pseudo(ip->ip_src.s_addr,
3348						       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
3349				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
3350				if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
3351					ip->ip_sum = 0;
3352					ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
3353				}
3354			}
3355		}
3356		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
3357			ip->ip_sum = 0;
3358
3359		break;
3360	}
3361#endif
3362#ifdef INET6
3363	case ETHERTYPE_IPV6:
3364	{
3365		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
3366		struct tcphdr *th;
3367		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
3368
3369		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
3370			txq->ift_pullups++;
3371			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
3372				return (ENOMEM);
3373		}
3374		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
3375
3376		/* XXX-BZ this will go badly in case of ext hdrs. */
3377		pi->ipi_ipproto = ip6->ip6_nxt;
3378		pi->ipi_flags |= IPI_TX_IPV6;
3379
3380		/* TCP checksum offload may require TCP header length */
3381		if (IS_TX_OFFLOAD6(pi)) {
3382			if (pi->ipi_ipproto == IPPROTO_TCP) {
3383				if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
3384					txq->ift_pullups++;
3385					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
3386						return (ENOMEM);
3387				}
3388				pi->ipi_tcp_hflags = th->th_flags;
3389				pi->ipi_tcp_hlen = th->th_off << 2;
3390				pi->ipi_tcp_seq = th->th_seq;
3391			}
3392			if (IS_TSO6(pi)) {
3393				if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
3394					return (ENXIO);
3395				/*
3396				 * TSO always requires hardware checksum offload.
3397				 */
3398				pi->ipi_csum_flags |= CSUM_IP6_TCP;
3399				th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
3400				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
3401			}
3402		}
3403		break;
3404	}
3405#endif
3406	default:
3407		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
3408		pi->ipi_ip_hlen = 0;
3409		break;
3410	}
3411	*mp = m;
3412
3413	return (0);
3414}
3415
3416/*
3417 * If dodgy hardware rejects the scatter gather chain we've handed it
3418 * we'll need to remove the mbuf chain from ifsd_m[] before we can add the
3419 * m_defrag'd mbufs
3420 */
3421static __noinline struct mbuf *
3422iflib_remove_mbuf(iflib_txq_t txq)
3423{
3424	int ntxd, pidx;
3425	struct mbuf *m, **ifsd_m;
3426
3427	ifsd_m = txq->ift_sds.ifsd_m;
3428	ntxd = txq->ift_size;
3429	pidx = txq->ift_pidx & (ntxd - 1);
3431	m = ifsd_m[pidx];
3432	ifsd_m[pidx] = NULL;
3433	bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[pidx]);
3434	if (txq->ift_sds.ifsd_tso_map != NULL)
3435		bus_dmamap_unload(txq->ift_tso_buf_tag,
3436		    txq->ift_sds.ifsd_tso_map[pidx]);
3437#if MEMORY_LOGGING
3438	txq->ift_dequeued++;
3439#endif
3440	return (m);
3441}
3442
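/*
 * Compute the address of the cache line after the TX descriptor at @cidx in
 * ring @qid, wrapping to the start of the descriptor area; used only for
 * prefetching.
 */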
3443static inline caddr_t
3444calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
3445{
3446	qidx_t size;
3447	int ntxd;
3448	caddr_t start, end, cur, next;
3449
3450	ntxd = txq->ift_size;
3451	size = txq->ift_txd_size[qid];
3452	start = txq->ift_ifdi[qid].idi_vaddr;
3453
3454	if (__predict_false(size == 0))
3455		return (start);
3456	cur = start + size*cidx;
3457	end = start + size*ntxd;
3458	next = CACHE_PTR_NEXT(cur);
3459	return (next < end ? next : start);
3460}
3461
3462/*
3463 * Pad an mbuf to ensure a minimum ethernet frame size.
3464 * min_frame_size is the frame size (less CRC) to pad the mbuf to
3465 */
3466static __noinline int
3467iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
3468{
3469	/*
3470	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
3471	 * an ARP message is the smallest common payload I can think of
3472	 */
3473	static char pad[18];	/* just zeros */
3474	int n;
3475	struct mbuf *new_head;
3476
3477	if (!M_WRITABLE(*m_head)) {
3478		new_head = m_dup(*m_head, M_NOWAIT);
3479		if (new_head == NULL) {
3480			m_freem(*m_head);
3481			device_printf(dev, "cannot pad short frame, m_dup() failed\n");
3482			DBG_COUNTER_INC(encap_pad_mbuf_fail);
3483			DBG_COUNTER_INC(tx_frees);
3484			return ENOMEM;
3485		}
3486		m_freem(*m_head);
3487		*m_head = new_head;
3488	}
3489
3490	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
3491	     n > 0; n -= sizeof(pad))
3492		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
3493			break;
3494
3495	if (n > 0) {
3496		m_freem(*m_head);
3497		device_printf(dev, "cannot pad short frame\n");
3498		DBG_COUNTER_INC(encap_pad_mbuf_fail);
3499		DBG_COUNTER_INC(tx_frees);
3500		return (ENOBUFS);
3501	}
3502
3503	return 0;
3504}
3505
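/*
 * Encapsulate one mbuf chain into TX descriptors: pad runts if the driver
 * requires it, parse offload headers, DMA-map the chain (retrying once with
 * m_collapse() and once with m_defrag() on EFBIG), verify descriptor
 * availability, and finally call the driver's isc_txd_encap().  On success
 * the queue's pidx, in-use and doorbell-pending counts are advanced.
 */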
3506static int
3507iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
3508{
3509	if_ctx_t		ctx;
3510	if_shared_ctx_t		sctx;
3511	if_softc_ctx_t		scctx;
3512	bus_dma_tag_t		buf_tag;
3513	bus_dma_segment_t	*segs;
3514	struct mbuf		*m_head, **ifsd_m;
3515	void			*next_txd;
3516	bus_dmamap_t		map;
3517	struct if_pkt_info	pi;
3518	int remap = 0;
3519	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
3520
3521	ctx = txq->ift_ctx;
3522	sctx = ctx->ifc_sctx;
3523	scctx = &ctx->ifc_softc_ctx;
3524	segs = txq->ift_segs;
3525	ntxd = txq->ift_size;
3526	m_head = *m_headp;
3527	map = NULL;
3528
3529	/*
3530	 * If we're doing TSO the next descriptor to clean may be quite far ahead
3531	 */
3532	cidx = txq->ift_cidx;
3533	pidx = txq->ift_pidx;
3534	if (ctx->ifc_flags & IFC_PREFETCH) {
3535		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
3536		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
3537			next_txd = calc_next_txd(txq, cidx, 0);
3538			prefetch(next_txd);
3539		}
3540
3541		/* prefetch the next cache line of mbuf pointers and flags */
3542		prefetch(&txq->ift_sds.ifsd_m[next]);
3543		prefetch(&txq->ift_sds.ifsd_map[next]);
3544		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
3545	}
3546	map = txq->ift_sds.ifsd_map[pidx];
3547	ifsd_m = txq->ift_sds.ifsd_m;
3548
3549	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3550		buf_tag = txq->ift_tso_buf_tag;
3551		max_segs = scctx->isc_tx_tso_segments_max;
3552		map = txq->ift_sds.ifsd_tso_map[pidx];
3553		MPASS(buf_tag != NULL);
3554		MPASS(max_segs > 0);
3555	} else {
3556		buf_tag = txq->ift_buf_tag;
3557		max_segs = scctx->isc_tx_nsegments;
3558		map = txq->ift_sds.ifsd_map[pidx];
3559	}
3560	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
3561	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
3562		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
3563		if (err) {
3564			DBG_COUNTER_INC(encap_txd_encap_fail);
3565			return err;
3566		}
3567	}
3568	m_head = *m_headp;
3569
3570	pkt_info_zero(&pi);
3571	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
3572	pi.ipi_pidx = pidx;
3573	pi.ipi_qsidx = txq->ift_id;
3574	pi.ipi_len = m_head->m_pkthdr.len;
3575	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
3576	pi.ipi_vtag = M_HAS_VLANTAG(m_head) ? m_head->m_pkthdr.ether_vtag : 0;
3577
3578	/* deliberate bitwise OR to make one condition */
3579	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
3580		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
3581			DBG_COUNTER_INC(encap_txd_encap_fail);
3582			return (err);
3583		}
3584		m_head = *m_headp;
3585	}
3586
3587retry:
3588	err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
3589	    BUS_DMA_NOWAIT);
3590defrag:
3591	if (__predict_false(err)) {
3592		switch (err) {
3593		case EFBIG:
3594			/* try collapse once and defrag once */
3595			if (remap == 0) {
3596				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
3597				/* try defrag if collapsing fails */
3598				if (m_head == NULL)
3599					remap++;
3600			}
3601			if (remap == 1) {
3602				txq->ift_mbuf_defrag++;
3603				m_head = m_defrag(*m_headp, M_NOWAIT);
3604			}
3605			/*
3606			 * remap should never be >1 unless bus_dmamap_load_mbuf_sg
3607			 * failed to map an mbuf that was run through m_defrag
3608			 */
3609			MPASS(remap <= 1);
3610			if (__predict_false(m_head == NULL || remap > 1))
3611				goto defrag_failed;
3612			remap++;
3613			*m_headp = m_head;
3614			goto retry;
3616		case ENOMEM:
3617			txq->ift_no_tx_dma_setup++;
3618			break;
3619		default:
3620			txq->ift_no_tx_dma_setup++;
3621			m_freem(*m_headp);
3622			DBG_COUNTER_INC(tx_frees);
3623			*m_headp = NULL;
3624			break;
3625		}
3626		txq->ift_map_failed++;
3627		DBG_COUNTER_INC(encap_load_mbuf_fail);
3628		DBG_COUNTER_INC(encap_txd_encap_fail);
3629		return (err);
3630	}
3631	ifsd_m[pidx] = m_head;
3632	/*
3633	 * XXX assumes a 1 to 1 relationship between segments and
3634	 *        descriptors - this does not hold true on all drivers, e.g.
3635	 *        cxgb
3636	 */
3637	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
3638		txq->ift_no_desc_avail++;
3639		bus_dmamap_unload(buf_tag, map);
3640		DBG_COUNTER_INC(encap_txq_avail_fail);
3641		DBG_COUNTER_INC(encap_txd_encap_fail);
3642		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
3643			GROUPTASK_ENQUEUE(&txq->ift_task);
3644		return (ENOBUFS);
3645	}
3646	/*
3647	 * On Intel cards we can greatly reduce the number of TX interrupts
3648	 * we see by only setting report status on every Nth descriptor.
3649	 * However, this also means that the driver will need to keep track
3650	 * of the descriptors that RS was set on to check them for the DD bit.
3651	 */
3652	txq->ift_rs_pending += nsegs + 1;
3653	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
3654	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
3655		pi.ipi_flags |= IPI_TX_INTR;
3656		txq->ift_rs_pending = 0;
3657	}
3658
3659	pi.ipi_segs = segs;
3660	pi.ipi_nsegs = nsegs;
3661
3662	MPASS(pidx >= 0 && pidx < txq->ift_size);
3663#ifdef PKT_DEBUG
3664	print_pkt(&pi);
3665#endif
3666	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
3667		bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
3668		DBG_COUNTER_INC(tx_encap);
3669		MPASS(pi.ipi_new_pidx < txq->ift_size);
3670
3671		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
3672		if (pi.ipi_new_pidx < pi.ipi_pidx) {
3673			ndesc += txq->ift_size;
3674			txq->ift_gen = 1;
3675		}
3676		/*
3677		 * Drivers may need as many as
3678		 * two sentinel descriptors.
3679		 */
3680		MPASS(ndesc <= pi.ipi_nsegs + 2);
3681		MPASS(pi.ipi_new_pidx != pidx);
3682		MPASS(ndesc > 0);
3683		txq->ift_in_use += ndesc;
3684		txq->ift_db_pending += ndesc;
3685
3686		/*
3687		 * We update the last software descriptor again here because there may
3688		 * be a sentinel and/or there may be more mbufs than segments
3689		 */
3690		txq->ift_pidx = pi.ipi_new_pidx;
3691		txq->ift_npending += pi.ipi_ndescs;
3692	} else {
3693		*m_headp = m_head = iflib_remove_mbuf(txq);
3694		if (err == EFBIG) {
3695			txq->ift_txd_encap_efbig++;
3696			if (remap < 2) {
3697				remap = 1;
3698				goto defrag;
3699			}
3700		}
3701		goto defrag_failed;
3702	}
3703	/*
3704	 * err can't possibly be non-zero here, so we don't need to test it
3705	 * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
3706	 */
3707	return (err);
3708
3709defrag_failed:
3710	txq->ift_mbuf_defrag_failed++;
3711	txq->ift_map_failed++;
3712	m_freem(*m_headp);
3713	DBG_COUNTER_INC(tx_frees);
3714	*m_headp = NULL;
3715	DBG_COUNTER_INC(encap_txd_encap_fail);
3716	return (ENOMEM);
3717}
3718
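/*
 * Release the software state for the next n completed TX descriptors
 * starting at the consumer index: sync and unload each mbuf's DMA map
 * (TSO or regular tag, as appropriate), free the mbuf, and advance
 * ift_cidx, wrapping and clearing the generation flag at the end of
 * the ring.
 */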
3719static void
3720iflib_tx_desc_free(iflib_txq_t txq, int n)
3721{
3722	uint32_t qsize, cidx, mask, gen;
3723	struct mbuf *m, **ifsd_m;
3724	bool do_prefetch;
3725
3726	cidx = txq->ift_cidx;
3727	gen = txq->ift_gen;
3728	qsize = txq->ift_size;
3729	mask = qsize-1;
3730	ifsd_m = txq->ift_sds.ifsd_m;
3731	do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
3732
3733	while (n-- > 0) {
3734		if (do_prefetch) {
3735			prefetch(ifsd_m[(cidx + 3) & mask]);
3736			prefetch(ifsd_m[(cidx + 4) & mask]);
3737		}
3738		if ((m = ifsd_m[cidx]) != NULL) {
3739			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
3740			if (m->m_pkthdr.csum_flags & CSUM_TSO) {
3741				bus_dmamap_sync(txq->ift_tso_buf_tag,
3742				    txq->ift_sds.ifsd_tso_map[cidx],
3743				    BUS_DMASYNC_POSTWRITE);
3744				bus_dmamap_unload(txq->ift_tso_buf_tag,
3745				    txq->ift_sds.ifsd_tso_map[cidx]);
3746			} else {
3747				bus_dmamap_sync(txq->ift_buf_tag,
3748				    txq->ift_sds.ifsd_map[cidx],
3749				    BUS_DMASYNC_POSTWRITE);
3750				bus_dmamap_unload(txq->ift_buf_tag,
3751				    txq->ift_sds.ifsd_map[cidx]);
3752			}
3753			/* XXX we don't support any drivers that batch packets yet */
3754			MPASS(m->m_nextpkt == NULL);
3755			m_freem(m);
3756			ifsd_m[cidx] = NULL;
3757#if MEMORY_LOGGING
3758			txq->ift_dequeued++;
3759#endif
3760			DBG_COUNTER_INC(tx_frees);
3761		}
3762		if (__predict_false(++cidx == qsize)) {
3763			cidx = 0;
3764			gen = 0;
3765		}
3766	}
3767	txq->ift_cidx = cidx;
3768	txq->ift_gen = gen;
3769}
3770
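/*
 * Reclaim completed TX descriptors once more than 'thresh' of them are
 * reclaimable: refresh the credit count from the hardware, free the
 * associated mbufs and maps, and return the number reclaimed (zero if
 * the threshold has not been reached).
 */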
3771static __inline int
3772iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
3773{
3774	int reclaim;
3775	if_ctx_t ctx = txq->ift_ctx;
3776
3777	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
3778	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
3779
3780	/*
3781	 * Need a rate-limiting check so that this isn't called every time
3782	 */
3783	iflib_tx_credits_update(ctx, txq);
3784	reclaim = DESC_RECLAIMABLE(txq);
3785
3786	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
3787#ifdef INVARIANTS
3788		if (iflib_verbose_debug) {
3789			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
3790			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
3791			       reclaim, thresh);
3792		}
3793#endif
3794		return (0);
3795	}
3796	iflib_tx_desc_free(txq, reclaim);
3797	txq->ift_cleaned += reclaim;
3798	txq->ift_in_use -= reclaim;
3799
3800	return (reclaim);
3801}
3802
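/*
 * Return a pointer to the mp_ring slot at (cidx + offset), prefetching
 * that entry and, when more items remain, the next few mbuf pointers so
 * the drain loop walks warm cache lines.
 */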
3803static struct mbuf **
3804_ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
3805{
3806	int next, size;
3807	struct mbuf **items;
3808
3809	size = r->size;
3810	next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
3811	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
3812
3813	prefetch(items[(cidx + offset) & (size-1)]);
3814	if (remaining > 1) {
3815		prefetch2cachelines(&items[next]);
3816		prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]);
3817		prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]);
3818		prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]);
3819	}
3820	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
3821}
3822
3823static void
3824iflib_txq_check_drain(iflib_txq_t txq, int budget)
3825{
3826
3827	ifmp_ring_check_drainage(txq->ift_br, budget);
3828}
3829
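/*
 * mp_ring can_drain callback: report that the TX queue can make progress,
 * either because enough descriptors are free for a maximally fragmented
 * packet or because the hardware reports newly completed descriptors.
 */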
3830static uint32_t
3831iflib_txq_can_drain(struct ifmp_ring *r)
3832{
3833	iflib_txq_t txq = r->cookie;
3834	if_ctx_t ctx = txq->ift_ctx;
3835
3836	if (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)
3837		return (1);
3838	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3839	    BUS_DMASYNC_POSTREAD);
3840	return (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id,
3841	    false));
3842}
3843
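/*
 * mp_ring drain callback: reclaim completed descriptors, then encapsulate
 * and hand to the hardware as many queued mbufs as descriptor space and
 * TX_BATCH_SIZE allow, skipping the txq-pointer sentinels used to kick
 * the ring from completion interrupts.  Returns the number of ring
 * entries consumed.
 */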
3844static uint32_t
3845iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
3846{
3847	iflib_txq_t txq = r->cookie;
3848	if_ctx_t ctx = txq->ift_ctx;
3849	if_t ifp = ctx->ifc_ifp;
3850	struct mbuf *m, **mp;
3851	int avail, bytes_sent, skipped, count, err, i;
3852	int mcast_sent, pkt_sent, reclaimed;
3853	bool do_prefetch, rang, ring;
3854
3855	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
3856			    !LINK_ACTIVE(ctx))) {
3857		DBG_COUNTER_INC(txq_drain_notready);
3858		return (0);
3859	}
3860	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
3861	rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
3862	avail = IDXDIFF(pidx, cidx, r->size);
3863
3864	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
3865		/*
3866		 * The driver is unloading so we need to free all pending packets.
3867		 */
3868		DBG_COUNTER_INC(txq_drain_flushing);
3869		for (i = 0; i < avail; i++) {
3870			if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
3871				m_freem(r->items[(cidx + i) & (r->size-1)]);
3872			r->items[(cidx + i) & (r->size-1)] = NULL;
3873		}
3874		return (avail);
3875	}
3876
3877	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
3878		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3879		CALLOUT_LOCK(txq);
3880		callout_stop(&txq->ift_timer);
3881		CALLOUT_UNLOCK(txq);
3882		DBG_COUNTER_INC(txq_drain_oactive);
3883		return (0);
3884	}
3885
3886	/*
3887	 * If we've reclaimed any packets this queue cannot be hung.
3888	 */
3889	if (reclaimed)
3890		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3891	skipped = mcast_sent = bytes_sent = pkt_sent = 0;
3892	count = MIN(avail, TX_BATCH_SIZE);
3893#ifdef INVARIANTS
3894	if (iflib_verbose_debug)
3895		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
3896		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
3897#endif
3898	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
3899	err = 0;
3900	for (i = 0; i < count && TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx) + 2; i++) {
3901		int rem = do_prefetch ? count - i : 0;
3902
3903		mp = _ring_peek_one(r, cidx, i, rem);
3904		MPASS(mp != NULL && *mp != NULL);
3905
3906		/*
3907		 * Completion interrupts will use the address of the txq
3908		 * as a sentinel to enqueue _something_ in order to acquire
3909		 * the lock on the mp_ring (there's no direct lock call).
3910		 * We obviously have to check for these sentinel cases
3911		 * and skip them.
3912		 */
3913		if (__predict_false(*mp == (struct mbuf *)txq)) {
3914			skipped++;
3915			continue;
3916		}
3917		err = iflib_encap(txq, mp);
3918		if (__predict_false(err)) {
3919			/* no room - bail out */
3920			if (err == ENOBUFS)
3921				break;
3922			skipped++;
3923			/* we can't send this packet - skip it */
3924			continue;
3925		}
3926		pkt_sent++;
3927		m = *mp;
3928		DBG_COUNTER_INC(tx_sent);
3929		bytes_sent += m->m_pkthdr.len;
3930		mcast_sent += !!(m->m_flags & M_MCAST);
3931
3932		if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING)))
3933			break;
3934		ETHER_BPF_MTAP(ifp, m);
3935		rang = iflib_txd_db_check(txq, false);
3936	}
3937
3938	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
3939	ring = rang ? false : (iflib_min_tx_latency | err);
3940	iflib_txd_db_check(txq, ring);
3941	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
3942	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
3943	if (mcast_sent)
3944		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
3945#ifdef INVARIANTS
3946	if (iflib_verbose_debug)
3947		printf("consumed=%d\n", skipped + pkt_sent);
3948#endif
3949	return (skipped + pkt_sent);
3950}
3951
3952static uint32_t
3953iflib_txq_drain_always(struct ifmp_ring *r)
3954{
3955	return (1);
3956}
3957
3958static uint32_t
3959iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
3960{
3961	int i, avail;
3962	struct mbuf **mp;
3963	iflib_txq_t txq;
3964
3965	txq = r->cookie;
3966
3967	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3968	CALLOUT_LOCK(txq);
3969	callout_stop(&txq->ift_timer);
3970	CALLOUT_UNLOCK(txq);
3971
3972	avail = IDXDIFF(pidx, cidx, r->size);
3973	for (i = 0; i < avail; i++) {
3974		mp = _ring_peek_one(r, cidx, i, avail - i);
3975		if (__predict_false(*mp == (struct mbuf *)txq))
3976			continue;
3977		m_freem(*mp);
3978		DBG_COUNTER_INC(tx_frees);
3979	}
3980	MPASS(ifmp_ring_is_stalled(r) == 0);
3981	return (avail);
3982}
3983
3984static void
3985iflib_ifmp_purge(iflib_txq_t txq)
3986{
3987	struct ifmp_ring *r;
3988
3989	r = txq->ift_br;
3990	r->drain = iflib_txq_drain_free;
3991	r->can_drain = iflib_txq_drain_always;
3992
3993	ifmp_ring_check_drainage(r, r->size);
3994
3995	r->drain = iflib_txq_drain;
3996	r->can_drain = iflib_txq_can_drain;
3997}
3998
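/*
 * Deferred per-queue TX task: push any pending work into the mp_ring (or
 * check its drainage), then re-enable the queue's TX interrupt (or the
 * legacy interrupt).
 */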
3999static void
4000_task_fn_tx(void *context)
4001{
4002	iflib_txq_t txq = context;
4003	if_ctx_t ctx = txq->ift_ctx;
4004	if_t ifp = ctx->ifc_ifp;
4005	int abdicate = ctx->ifc_sysctl_tx_abdicate;
4006
4007#ifdef IFLIB_DIAGNOSTICS
4008	txq->ift_cpu_exec_count[curcpu]++;
4009#endif
4010	if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
4011		return;
4012#ifdef DEV_NETMAP
4013	if ((if_getcapenable(ifp) & IFCAP_NETMAP) &&
4014	    netmap_tx_irq(ifp, txq->ift_id))
4015		goto skip_ifmp;
4016#endif
4017#ifdef ALTQ
4018	if (ALTQ_IS_ENABLED(&ifp->if_snd))
4019		iflib_altq_if_start(ifp);
4020#endif
4021	if (txq->ift_db_pending)
4022		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
4023	else if (!abdicate)
4024		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4025	/*
4026	 * When abdicating, we always need to check drainage, not just when we don't enqueue
4027	 */
4028	if (abdicate)
4029		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4030#ifdef DEV_NETMAP
4031skip_ifmp:
4032#endif
4033	if (ctx->ifc_flags & IFC_LEGACY)
4034		IFDI_INTR_ENABLE(ctx);
4035	else
4036		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
4037}
4038
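/*
 * Deferred per-queue RX task: process up to the configured budget of
 * received frames, re-enable the RX interrupt when no more work is
 * pending, and otherwise reschedule the task or arm the free-list
 * watchdog.
 */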
4039static void
4040_task_fn_rx(void *context)
4041{
4042	iflib_rxq_t rxq = context;
4043	if_ctx_t ctx = rxq->ifr_ctx;
4044	uint8_t more;
4045	uint16_t budget;
4046#ifdef DEV_NETMAP
4047	u_int work = 0;
4048	int nmirq;
4049#endif
4050
4051#ifdef IFLIB_DIAGNOSTICS
4052	rxq->ifr_cpu_exec_count[curcpu]++;
4053#endif
4054	DBG_COUNTER_INC(task_fn_rxs);
4055	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
4056		return;
4057#ifdef DEV_NETMAP
4058	nmirq = netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work);
4059	if (nmirq != NM_IRQ_PASS) {
4060		more = (nmirq == NM_IRQ_RESCHED) ? IFLIB_RXEOF_MORE : 0;
4061		goto skip_rxeof;
4062	}
4063#endif
4064	budget = ctx->ifc_sysctl_rx_budget;
4065	if (budget == 0)
4066		budget = 16;	/* XXX */
4067	more = iflib_rxeof(rxq, budget);
4068#ifdef DEV_NETMAP
4069skip_rxeof:
4070#endif
4071	if ((more & IFLIB_RXEOF_MORE) == 0) {
4072		if (ctx->ifc_flags & IFC_LEGACY)
4073			IFDI_INTR_ENABLE(ctx);
4074		else
4075			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
4076		DBG_COUNTER_INC(rx_intr_enables);
4077	}
4078	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
4079		return;
4080
4081	if (more & IFLIB_RXEOF_MORE)
4082		GROUPTASK_ENQUEUE(&rxq->ifr_task);
4083	else if (more & IFLIB_RXEOF_EMPTY)
4084#ifndef __HAIKU__
4085		callout_reset_curcpu(&rxq->ifr_watchdog, 1, &_task_fn_rx_watchdog, rxq);
4086#else
4087		callout_reset(&rxq->ifr_watchdog, 1, &_task_fn_rx_watchdog, rxq);
4088#endif
4089}
4090
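/*
 * Admin task: snapshot and clear the reset/watchdog flags, quiesce the
 * per-queue timers, run the driver's watchdog and admin-status handlers,
 * rearm the timers and link interrupt, and re-init the interface if a
 * reset was requested.
 */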
4091static void
4092_task_fn_admin(void *context)
4093{
4094	if_ctx_t ctx = context;
4095	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
4096	iflib_txq_t txq;
4097	int i;
4098	bool oactive, running, do_reset, do_watchdog, in_detach;
4099
4100	STATE_LOCK(ctx);
4101	running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
4102	oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
4103	do_reset = (ctx->ifc_flags & IFC_DO_RESET);
4104	do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
4105	in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
4106	ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG);
4107	STATE_UNLOCK(ctx);
4108
4109	if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
4110		return;
4111	if (in_detach)
4112		return;
4113
4114	CTX_LOCK(ctx);
4115	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
4116		CALLOUT_LOCK(txq);
4117		callout_stop(&txq->ift_timer);
4118		CALLOUT_UNLOCK(txq);
4119	}
4120	if (do_watchdog) {
4121		ctx->ifc_watchdog_events++;
4122		IFDI_WATCHDOG_RESET(ctx);
4123	}
4124	IFDI_UPDATE_ADMIN_STATUS(ctx);
4125	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
4126		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
4127		    txq->ift_timer.c_cpu);
4128	}
4129	IFDI_LINK_INTR_ENABLE(ctx);
4130	if (do_reset)
4131		iflib_if_init_locked(ctx);
4132	CTX_UNLOCK(ctx);
4133
4134	if (LINK_ACTIVE(ctx) == 0)
4135		return;
4136	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
4137		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
4138}
4139
4140static void
4141_task_fn_iov(void *context)
4142{
4143	if_ctx_t ctx = context;
4144
4145	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) &&
4146	    !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
4147		return;
4148
4149	CTX_LOCK(ctx);
4150	IFDI_VFLR_HANDLE(ctx);
4151	CTX_UNLOCK(ctx);
4152}
4153
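/*
 * Sysctl handler that forwards interrupt-delay tuning requests to the
 * driver via IFDI_SYSCTL_INT_DELAY() with the context lock held.
 */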
4154static int
4155iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
4156{
4157	int err;
4158	if_int_delay_info_t info;
4159	if_ctx_t ctx;
4160
4161	info = (if_int_delay_info_t)arg1;
4162	ctx = info->iidi_ctx;
4163	info->iidi_req = req;
4164	info->iidi_oidp = oidp;
4165	CTX_LOCK(ctx);
4166	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
4167	CTX_UNLOCK(ctx);
4168	return (err);
4169}
4170
4171/*********************************************************************
4172 *
4173 *  IFNET FUNCTIONS
4174 *
4175 **********************************************************************/
4176
4177static void
4178iflib_if_init_locked(if_ctx_t ctx)
4179{
4180	iflib_stop(ctx);
4181	iflib_init_locked(ctx);
4182}
4183
4184static void
4185iflib_if_init(void *arg)
4186{
4187	if_ctx_t ctx = arg;
4188
4189	CTX_LOCK(ctx);
4190	iflib_if_init_locked(ctx);
4191	CTX_UNLOCK(ctx);
4192}
4193
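/*
 * if_transmit handler: pick a TX queue from the mbuf's flow hash (queue 0
 * when ALTQ is enabled or only one queue set exists) and enqueue the
 * packet on that queue's mp_ring, optionally abdicating the drain to the
 * TX grouptask.
 */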
4194static int
4195iflib_if_transmit(if_t ifp, struct mbuf *m)
4196{
4197	if_ctx_t	ctx = if_getsoftc(ifp);
4198
4199	iflib_txq_t txq;
4200	int err, qidx;
4201	int abdicate = ctx->ifc_sysctl_tx_abdicate;
4202
4203	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
4204		DBG_COUNTER_INC(tx_frees);
4205		m_freem(m);
4206		return (ENETDOWN);
4207	}
4208
4209	MPASS(m->m_nextpkt == NULL);
4210	/* ALTQ-enabled interfaces always use queue 0. */
4211	qidx = 0;
4212	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd))
4213		qidx = QIDX(ctx, m);
4214	/*
4215	 * XXX calculate buf_ring based on flowid (divvy up bits?)
4216	 */
4217	txq = &ctx->ifc_txqs[qidx];
4218
4219#ifdef DRIVER_BACKPRESSURE
4220	if (txq->ift_closed) {
4221		while (m != NULL) {
4222			next = m->m_nextpkt;
4223			m->m_nextpkt = NULL;
4224			m_freem(m);
4225			DBG_COUNTER_INC(tx_frees);
4226			m = next;
4227		}
4228		return (ENOBUFS);
4229	}
4230#endif
4231#ifdef notyet
4232	qidx = count = 0;
4233	mp = marr;
4234	next = m;
4235	do {
4236		count++;
4237		next = next->m_nextpkt;
4238	} while (next != NULL);
4239
4240	if (count > nitems(marr))
4241		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
4242			/* XXX check nextpkt */
4243			m_freem(m);
4244			/* XXX simplify for now */
4245			DBG_COUNTER_INC(tx_frees);
4246			return (ENOBUFS);
4247		}
4248	for (next = m, i = 0; next != NULL; i++) {
4249		mp[i] = next;
4250		next = next->m_nextpkt;
4251		mp[i]->m_nextpkt = NULL;
4252	}
4253#endif
4254	DBG_COUNTER_INC(tx_seen);
4255	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
4256
4257	if (abdicate)
4258		GROUPTASK_ENQUEUE(&txq->ift_task);
4259 	if (err) {
4260		if (!abdicate)
4261			GROUPTASK_ENQUEUE(&txq->ift_task);
4262		/* back-pressure support forthcoming */
4263#ifdef DRIVER_BACKPRESSURE
4264		txq->ift_closed = TRUE;
4265#endif
4266		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4267		m_freem(m);
4268		DBG_COUNTER_INC(tx_frees);
4269	}
4270
4271	return (err);
4272}
4273
4274#ifdef ALTQ
4275/*
4276 * The overall approach to integrating iflib with ALTQ is to continue to use
4277 * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
4278 * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
4279 * is redundant/unnecessary, but doing so minimizes the amount of
4280 * ALTQ-specific code required in iflib.  It is assumed that the overhead of
4281 * redundantly queueing to an intermediate mp_ring is swamped by the
4282 * performance limitations inherent in using ALTQ.
4283 *
4284 * When ALTQ support is compiled in, all iflib drivers will use a transmit
4285 * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
4286 * given interface.  If ALTQ is enabled for an interface, then all
4287 * transmitted packets for that interface will be submitted to the ALTQ
4288 * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
4289 * implementation because it uses IFQ_HANDOFF(), which will duplicatively
4290 * update stats that the iflib machinery handles, and which is sensitive to
4291 * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
4292 * will be installed as the start routine for use by ALTQ facilities that
4293 * need to trigger queue drains on a scheduled basis.
4294 *
4295 */
4296static void
4297iflib_altq_if_start(if_t ifp)
4298{
4299	struct ifaltq *ifq = &ifp->if_snd;
4300	struct mbuf *m;
4301
4302	IFQ_LOCK(ifq);
4303	IFQ_DEQUEUE_NOLOCK(ifq, m);
4304	while (m != NULL) {
4305		iflib_if_transmit(ifp, m);
4306		IFQ_DEQUEUE_NOLOCK(ifq, m);
4307	}
4308	IFQ_UNLOCK(ifq);
4309}
4310
4311static int
4312iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
4313{
4314	int err;
4315
4316	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
4317		IFQ_ENQUEUE(&ifp->if_snd, m, err);
4318		if (err == 0)
4319			iflib_altq_if_start(ifp);
4320	} else
4321		err = iflib_if_transmit(ifp, m);
4322
4323	return (err);
4324}
4325#endif /* ALTQ */
4326
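/*
 * if_qflush handler: set IFC_QFLUSH so iflib_txq_drain() frees queued
 * packets instead of transmitting them, drain every TX mp_ring, then
 * clear the flag and let if_qflush() purge any ALTQ queues.
 */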
4327static void
4328iflib_if_qflush(if_t ifp)
4329{
4330	if_ctx_t ctx = if_getsoftc(ifp);
4331	iflib_txq_t txq = ctx->ifc_txqs;
4332	int i;
4333
4334	STATE_LOCK(ctx);
4335	ctx->ifc_flags |= IFC_QFLUSH;
4336	STATE_UNLOCK(ctx);
4337	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
4338		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
4339			iflib_txq_check_drain(txq, 0);
4340	STATE_LOCK(ctx);
4341	ctx->ifc_flags &= ~IFC_QFLUSH;
4342	STATE_UNLOCK(ctx);
4343
4344	/*
4345	 * When ALTQ is enabled, this will also take care of purging the
4346	 * ALTQ queue(s).
4347	 */
4348	if_qflush(ifp);
4349}
4350
4351#define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
4352		     IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
4353		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
4354		     IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_MEXTPG)
4355
4356static int
4357iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
4358{
4359	if_ctx_t ctx = if_getsoftc(ifp);
4360	struct ifreq	*ifr = (struct ifreq *)data;
4361#if defined(INET) || defined(INET6)
4362	struct ifaddr	*ifa = (struct ifaddr *)data;
4363#endif
4364	bool		avoid_reset = false;
4365	int		err = 0, reinit = 0, bits;
4366
4367	switch (command) {
4368	case SIOCSIFADDR:
4369#ifdef INET
4370		if (ifa->ifa_addr->sa_family == AF_INET)
4371			avoid_reset = true;
4372#endif
4373#ifdef INET6
4374		if (ifa->ifa_addr->sa_family == AF_INET6)
4375			avoid_reset = true;
4376#endif
4377		/*
4378		** Calling init results in link renegotiation,
4379		** so we avoid doing it when possible.
4380		*/
4381		if (avoid_reset) {
4382			if_setflagbits(ifp, IFF_UP,0);
4383			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
4384				reinit = 1;
4385#ifdef INET
4386			if (!(if_getflags(ifp) & IFF_NOARP))
4387				arp_ifinit(ifp, ifa);
4388#endif
4389		} else
4390			err = ether_ioctl(ifp, command, data);
4391		break;
4392	case SIOCSIFMTU:
4393		CTX_LOCK(ctx);
4394		if (ifr->ifr_mtu == if_getmtu(ifp)) {
4395			CTX_UNLOCK(ctx);
4396			break;
4397		}
4398		bits = if_getdrvflags(ifp);
4399		/* stop the driver and free any clusters before proceeding */
4400		iflib_stop(ctx);
4401
4402		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
4403			STATE_LOCK(ctx);
4404			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
4405				ctx->ifc_flags |= IFC_MULTISEG;
4406			else
4407				ctx->ifc_flags &= ~IFC_MULTISEG;
4408			STATE_UNLOCK(ctx);
4409			err = if_setmtu(ifp, ifr->ifr_mtu);
4410		}
4411		iflib_init_locked(ctx);
4412		STATE_LOCK(ctx);
4413		if_setdrvflags(ifp, bits);
4414		STATE_UNLOCK(ctx);
4415		CTX_UNLOCK(ctx);
4416		break;
4417	case SIOCSIFFLAGS:
4418		CTX_LOCK(ctx);
4419		if (if_getflags(ifp) & IFF_UP) {
4420			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4421				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
4422				    (IFF_PROMISC | IFF_ALLMULTI)) {
4423					CTX_UNLOCK(ctx);
4424					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
4425					CTX_LOCK(ctx);
4426				}
4427			} else
4428				reinit = 1;
4429		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4430			iflib_stop(ctx);
4431		}
4432		ctx->ifc_if_flags = if_getflags(ifp);
4433		CTX_UNLOCK(ctx);
4434		break;
4435	case SIOCADDMULTI:
4436	case SIOCDELMULTI:
4437		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4438			CTX_LOCK(ctx);
4439			IFDI_INTR_DISABLE(ctx);
4440			IFDI_MULTI_SET(ctx);
4441			IFDI_INTR_ENABLE(ctx);
4442			CTX_UNLOCK(ctx);
4443		}
4444		break;
4445	case SIOCSIFMEDIA:
4446		CTX_LOCK(ctx);
4447		IFDI_MEDIA_SET(ctx);
4448		CTX_UNLOCK(ctx);
4449		/* FALLTHROUGH */
4450	case SIOCGIFMEDIA:
4451#ifndef __HAIKU__
4452	case SIOCGIFXMEDIA:
4453#endif
4454		err = ifmedia_ioctl(ifp, ifr, ctx->ifc_mediap, command);
4455		break;
4456#ifndef __HAIKU__
4457	case SIOCGI2C:
4458	{
4459		struct ifi2creq i2c;
4460
4461		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4462		if (err != 0)
4463			break;
4464		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
4465			err = EINVAL;
4466			break;
4467		}
4468		if (i2c.len > sizeof(i2c.data)) {
4469			err = EINVAL;
4470			break;
4471		}
4472
4473		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
4474			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4475			    sizeof(i2c));
4476		break;
4477	}
4478#endif
4479	case SIOCSIFCAP:
4480	{
4481		int mask, setmask, oldmask;
4482
4483		oldmask = if_getcapenable(ifp);
4484		mask = ifr->ifr_reqcap ^ oldmask;
4485		mask &= ctx->ifc_softc_ctx.isc_capabilities | IFCAP_MEXTPG;
4486		setmask = 0;
4487#ifdef TCP_OFFLOAD
4488		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
4489#endif
4490		setmask |= (mask & IFCAP_FLAGS);
4491		setmask |= (mask & IFCAP_WOL);
4492
4493		/*
4494		 * If any RX csum has changed, change all the ones that
4495		 * are supported by the driver.
4496		 */
4497		if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
4498			setmask |= ctx->ifc_softc_ctx.isc_capabilities &
4499			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
4500		}
4501
4502		/*
4503		 * We want to ensure that traffic has stopped before we change any of the flags.
4504		 */
4505		if (setmask) {
4506			CTX_LOCK(ctx);
4507			bits = if_getdrvflags(ifp);
4508			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
4509				iflib_stop(ctx);
4510			STATE_LOCK(ctx);
4511			if_togglecapenable(ifp, setmask);
4512			STATE_UNLOCK(ctx);
4513			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
4514				iflib_init_locked(ctx);
4515			STATE_LOCK(ctx);
4516			if_setdrvflags(ifp, bits);
4517			STATE_UNLOCK(ctx);
4518			CTX_UNLOCK(ctx);
4519		}
4520		if_vlancap(ifp);
4521		break;
4522	}
4523	case SIOCGPRIVATE_0:
4524	case SIOCSDRVSPEC:
4525	case SIOCGDRVSPEC:
4526		CTX_LOCK(ctx);
4527		err = IFDI_PRIV_IOCTL(ctx, command, data);
4528		CTX_UNLOCK(ctx);
4529		break;
4530	default:
4531		err = ether_ioctl(ifp, command, data);
4532		break;
4533	}
4534	if (reinit)
4535		iflib_if_init(ctx);
4536	return (err);
4537}
4538
4539static uint64_t
4540iflib_if_get_counter(if_t ifp, ift_counter cnt)
4541{
4542	if_ctx_t ctx = if_getsoftc(ifp);
4543
4544	return (IFDI_GET_COUNTER(ctx, cnt));
4545}
4546
4547/*********************************************************************
4548 *
4549 *  OTHER FUNCTIONS EXPORTED TO THE STACK
4550 *
4551 **********************************************************************/
4552
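/*
 * VLAN config eventhandlers: pass (un)registration of a VLAN tag down to
 * the driver, stopping and re-initializing the interface around the call
 * when the driver indicates it needs a restart for VLAN changes.
 */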
4553static void
4554iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
4555{
4556	if_ctx_t ctx = if_getsoftc(ifp);
4557
4558	if ((void *)ctx != arg)
4559		return;
4560
4561	if ((vtag == 0) || (vtag > 4095))
4562		return;
4563
4564	if (iflib_in_detach(ctx))
4565		return;
4566
4567	CTX_LOCK(ctx);
4568	/* Driver may need all untagged packets to be flushed */
4569	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4570		iflib_stop(ctx);
4571	IFDI_VLAN_REGISTER(ctx, vtag);
4572	/* Re-init to load the changes, if required */
4573	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4574		iflib_init_locked(ctx);
4575	CTX_UNLOCK(ctx);
4576}
4577
4578static void
4579iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
4580{
4581	if_ctx_t ctx = if_getsoftc(ifp);
4582
4583	if ((void *)ctx != arg)
4584		return;
4585
4586	if ((vtag == 0) || (vtag > 4095))
4587		return;
4588
4589	CTX_LOCK(ctx);
4590	/* Driver may need all tagged packets to be flushed */
4591	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4592		iflib_stop(ctx);
4593	IFDI_VLAN_UNREGISTER(ctx, vtag);
4594	/* Re-init to load the changes, if required */
4595	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4596		iflib_init_locked(ctx);
4597	CTX_UNLOCK(ctx);
4598}
4599
4600static void
4601iflib_led_func(void *arg, int onoff)
4602{
4603	if_ctx_t ctx = arg;
4604
4605	CTX_LOCK(ctx);
4606	IFDI_LED_FUNC(ctx, onoff);
4607	CTX_UNLOCK(ctx);
4608}
4609
4610/*********************************************************************
4611 *
4612 *  BUS FUNCTION DEFINITIONS
4613 *
4614 **********************************************************************/
4615
4616int
4617iflib_device_probe(device_t dev)
4618{
4619	const pci_vendor_info_t *ent;
4620	if_shared_ctx_t sctx;
4621	uint16_t pci_device_id, pci_rev_id, pci_subdevice_id, pci_subvendor_id;
4622	uint16_t pci_vendor_id;
4623
4624	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
4625		return (ENOTSUP);
4626
4627	pci_vendor_id = pci_get_vendor(dev);
4628	pci_device_id = pci_get_device(dev);
4629	pci_subvendor_id = pci_get_subvendor(dev);
4630	pci_subdevice_id = pci_get_subdevice(dev);
4631	pci_rev_id = pci_get_revid(dev);
4632	if (sctx->isc_parse_devinfo != NULL)
4633		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
4634
4635	ent = sctx->isc_vendor_info;
4636	while (ent->pvi_vendor_id != 0) {
4637		if (pci_vendor_id != ent->pvi_vendor_id) {
4638			ent++;
4639			continue;
4640		}
4641		if ((pci_device_id == ent->pvi_device_id) &&
4642		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
4643		     (ent->pvi_subvendor_id == 0)) &&
4644		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
4645		     (ent->pvi_subdevice_id == 0)) &&
4646		    ((pci_rev_id == ent->pvi_rev_id) ||
4647		     (ent->pvi_rev_id == 0))) {
4648			device_set_desc_copy(dev, ent->pvi_name);
4649			/* This needs to be changed to zero if the bus probing code
4650			 * ever stops re-probing on the best match, because the sctx
4651			 * may have its values overwritten by register calls
4652			 * in subsequent probes.
4653			 */
4654			return (BUS_PROBE_DEFAULT);
4655		}
4656		ent++;
4657	}
4658	return (ENXIO);
4659}
4660
4661int
4662iflib_device_probe_vendor(device_t dev)
4663{
4664	int probe;
4665
4666	probe = iflib_device_probe(dev);
4667#ifndef __HAIKU__
4668	if (probe == BUS_PROBE_DEFAULT)
4669		return (BUS_PROBE_VENDOR);
4670	else
4671#endif
4672		return (probe);
4673}
4674
4675static void
4676iflib_reset_qvalues(if_ctx_t ctx)
4677{
4678	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4679	if_shared_ctx_t sctx = ctx->ifc_sctx;
4680	device_t dev = ctx->ifc_dev;
4681	int i;
4682
4683	if (ctx->ifc_sysctl_ntxqs != 0)
4684		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
4685	if (ctx->ifc_sysctl_nrxqs != 0)
4686		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
4687
4688	for (i = 0; i < sctx->isc_ntxqs; i++) {
4689		if (ctx->ifc_sysctl_ntxds[i] != 0)
4690			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
4691		else
4692			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4693	}
4694
4695	for (i = 0; i < sctx->isc_nrxqs; i++) {
4696		if (ctx->ifc_sysctl_nrxds[i] != 0)
4697			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
4698		else
4699			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4700	}
4701
4702	for (i = 0; i < sctx->isc_nrxqs; i++) {
4703		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
4704			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
4705				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
4706			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
4707		}
4708		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
4709			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
4710				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
4711			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
4712		}
4713		if (!powerof2(scctx->isc_nrxd[i])) {
4714			device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n",
4715				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_default[i]);
4716			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4717		}
4718	}
4719
4720	for (i = 0; i < sctx->isc_ntxqs; i++) {
4721		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
4722			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
4723				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
4724			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
4725		}
4726		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
4727			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
4728				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
4729			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
4730		}
4731		if (!powerof2(scctx->isc_ntxd[i])) {
4732			device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n",
4733				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_default[i]);
4734			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4735		}
4736	}
4737}
4738
4739static void
4740iflib_add_pfil(if_ctx_t ctx)
4741{
4742#ifndef __HAIKU__
4743	struct pfil_head *pfil;
4744	struct pfil_head_args pa;
4745	iflib_rxq_t rxq;
4746	int i;
4747
4748	pa.pa_version = PFIL_VERSION;
4749	pa.pa_flags = PFIL_IN;
4750	pa.pa_type = PFIL_TYPE_ETHERNET;
4751	pa.pa_headname = ctx->ifc_ifp->if_xname;
4752	pfil = pfil_head_register(&pa);
4753
4754	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
4755		rxq->pfil = pfil;
4756	}
4757#endif
4758}
4759
4760static void
4761iflib_rem_pfil(if_ctx_t ctx)
4762{
4763#ifndef __HAIKU__
4764	struct pfil_head *pfil;
4765	iflib_rxq_t rxq;
4766	int i;
4767
4768	rxq = ctx->ifc_rxqs;
4769	pfil = rxq->pfil;
4770	for (i = 0; i < NRXQSETS(ctx); i++, rxq++) {
4771		rxq->pfil = NULL;
4772	}
4773	pfil_head_unregister(pfil);
4774#endif
4775}
4776
4778#ifndef __HAIKU__
4779/*
4780 * Advance forward by n members of the cpuset ctx->ifc_cpus starting from
4781 * cpuid and wrapping as necessary.
4782 */
4783static unsigned int
4784cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
4785{
4786	unsigned int first_valid;
4787	unsigned int last_valid;
4788
4789	/* cpuid should always be in the valid set */
4790	MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
4791
4792	/* valid set should never be empty */
4793	MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
4794
4795	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
4796	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
4797	n = n % CPU_COUNT(&ctx->ifc_cpus);
4798	while (n > 0) {
4799		do {
4800			cpuid++;
4801			if (cpuid > last_valid)
4802				cpuid = first_valid;
4803		} while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
4804		n--;
4805	}
4806
4807	return (cpuid);
4808}
4809#endif
4810
4811#if defined(SMP) && defined(SCHED_ULE)
4812extern struct cpu_group *cpu_top;              /* CPU topology */
4813
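/*
 * Return the index of the child group of grp whose CPU mask contains the
 * given CPU, or -1 if grp has no children.
 */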
4814static int
4815find_child_with_core(int cpu, struct cpu_group *grp)
4816{
4817	int i;
4818
4819	if (grp->cg_children == 0)
4820		return -1;
4821
4822	MPASS(grp->cg_child);
4823	for (i = 0; i < grp->cg_children; i++) {
4824		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
4825			return i;
4826	}
4827
4828	return -1;
4829}
4830
4832/*
4833 * Find an L2 neighbor of the given CPU or return -1 if none found.  This
4834 * does not distinguish among multiple L2 neighbors if the given CPU has
4835 * more than one (it will always return the same result in that case).
4836 */
4837static int
4838find_l2_neighbor(int cpu)
4839{
4840	struct cpu_group *grp;
4841	int i;
4842
4843	grp = cpu_top;
4844	if (grp == NULL)
4845		return -1;
4846
4847	/*
4848	 * Find the smallest CPU group that contains the given core.
4849	 */
4850	i = 0;
4851	while ((i = find_child_with_core(cpu, grp)) != -1) {
4852		/*
4853		 * If the smallest group containing the given CPU has fewer
4854		 * than two members, we conclude the given CPU has no
4855		 * L2 neighbor.
4856		 */
4857		if (grp->cg_child[i].cg_count <= 1)
4858			return (-1);
4859		grp = &grp->cg_child[i];
4860	}
4861
4862	/* Must share L2. */
4863	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
4864		return -1;
4865
4866	/*
4867	 * Select the first member of the set that isn't the reference
4868	 * CPU, which at this point is guaranteed to exist.
4869	 */
4870	for (i = 0; i < CPU_SETSIZE; i++) {
4871		if (CPU_ISSET(i, &grp->cg_mask) && i != cpu)
4872			return (i);
4873	}
4874
4875	/* Should never be reached */
4876	return (-1);
4877}
4878
4879#else
4880static int
4881find_l2_neighbor(int cpu)
4882{
4883
4884	return (-1);
4885}
4886#endif
4887
4888#ifndef __HAIKU__
4889/*
4890 * CPU mapping behaviors
4891 * ---------------------
4892 * 'separate txrx' refers to the separate_txrx sysctl
4893 * 'use logical' refers to the use_logical_cores sysctl
4894 * 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
4895 *
4896 *  separate     use     INTR
4897 *    txrx     logical   CPUS   result
4898 * ---------- --------- ------ ------------------------------------------------
4899 *     -          -       X     RX and TX queues mapped to consecutive physical
4900 *                              cores with RX/TX pairs on same core and excess
4901 *                              of either following
4902 *     -          X       X     RX and TX queues mapped to consecutive cores
4903 *                              of any type with RX/TX pairs on same core and
4904 *                              excess of either following
4905 *     X          -       X     RX and TX queues mapped to consecutive physical
4906 *                              cores; all RX then all TX
4907 *     X          X       X     RX queues mapped to consecutive physical cores
4908 *                              first, then TX queues mapped to L2 neighbor of
4909 *                              the corresponding RX queue if one exists,
4910 *                              otherwise to consecutive physical cores
4911 *     -         n/a      -     RX and TX queues mapped to consecutive cores of
4912 *                              any type with RX/TX pairs on same core and excess
4913 *                              of either following
4914 *     X         n/a      -     RX and TX queues mapped to consecutive cores of
4915 *                              any type; all RX then all TX
4916 */
4917static unsigned int
4918get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
4919    bool is_tx)
4920{
4921	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4922	unsigned int core_index;
4923
4924	if (ctx->ifc_sysctl_separate_txrx) {
4925		/*
4926		 * When using separate CPUs for TX and RX, the assignment
4927		 * will always be of a consecutive CPU out of the set of
4928		 * context CPUs, except for the specific case where the
4929		 * context CPUs are physical cores, the use of logical cores
4930		 * has been enabled, the assignment is for TX, the TX qid
4931		 * corresponds to an RX qid, and the CPU assigned to the
4932		 * corresponding RX queue has an L2 neighbor.
4933		 */
4934		if (ctx->ifc_sysctl_use_logical_cores &&
4935		    ctx->ifc_cpus_are_physical_cores &&
4936		    is_tx && qid < scctx->isc_nrxqsets) {
4937			int l2_neighbor;
4938			unsigned int rx_cpuid;
4939
4940			rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
4941			l2_neighbor = find_l2_neighbor(rx_cpuid);
4942			if (l2_neighbor != -1) {
4943				return (l2_neighbor);
4944			}
4945			/*
4946			 * ... else fall through to the normal
4947			 * consecutive-after-RX assignment scheme.
4948			 *
4949			 * Note that we are assuming that all RX queue CPUs
4950			 * have an L2 neighbor, or all do not.  If a mixed
4951			 * scenario is possible, we will have to keep track
4952			 * separately of how many queues prior to this one
4953			 * were not able to be assigned to an L2 neighbor.
4954			 */
4955		}
4956		if (is_tx)
4957			core_index = scctx->isc_nrxqsets + qid;
4958		else
4959			core_index = qid;
4960	} else {
4961		core_index = qid;
4962	}
4963
4964	return (cpuid_advance(ctx, base_cpuid, core_index));
4965}
4966#else
4967#define get_cpuid_for_queue(...) CPU_FIRST()
4968#endif
4969
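/*
 * Choose the base CPU for this context's queue-to-CPU assignments: honor
 * the core_offset sysctl when set (normalized into the valid CPU set),
 * otherwise continue round-robin from where previous contexts sharing
 * the same CPU set left off, recording the consumption in cpu_offsets.
 */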
4970static uint16_t
4971get_ctx_core_offset(if_ctx_t ctx)
4972{
4973#ifndef __HAIKU__
4974	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4975	struct cpu_offset *op;
4976	cpuset_t assigned_cpus;
4977	unsigned int cores_consumed;
4978	unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
4979	unsigned int first_valid;
4980	unsigned int last_valid;
4981	unsigned int i;
4982
4983	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
4984	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
4985
4986	if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
4987		/*
4988		 * Align the user-chosen base CPU ID to the next valid CPU
4989		 * for this device.  If the chosen base CPU ID is smaller
4990		 * than the first valid CPU or larger than the last valid
4991		 * CPU, we assume the user does not know what the valid
4992		 * range is for this device and is thinking in terms of a
4993		 * zero-based reference frame, and so we shift the given
4994		 * value into the valid range (and wrap accordingly) so the
4995		 * intent is translated to the proper frame of reference.
4996		 * If the base CPU ID is within the valid first/last, but
4997		 * does not correspond to a valid CPU, it is advanced to the
4998		 * next valid CPU (wrapping if necessary).
4999		 */
5000		if (base_cpuid < first_valid || base_cpuid > last_valid) {
5001			/* shift from zero-based to first_valid-based */
5002			base_cpuid += first_valid;
5003			/* wrap to range [first_valid, last_valid] */
5004			base_cpuid = (base_cpuid - first_valid) %
5005			    (last_valid - first_valid + 1);
5006		}
5007		if (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus)) {
5008			/*
5009			 * base_cpuid is in [first_valid, last_valid], but
5010			 * not a member of the valid set.  In this case,
5011			 * there will always be a member of the valid set
5012			 * with a CPU ID that is greater than base_cpuid,
5013			 * and we simply advance to it.
5014			 */
5015			while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
5016				base_cpuid++;
5017		}
5018		return (base_cpuid);
5019	}
5020
5021	/*
5022	 * Determine how many cores will be consumed by performing the CPU
5023	 * assignments and counting how many of the assigned CPUs correspond
5024	 * to CPUs in the set of context CPUs.  This is done using the CPU
5025	 * ID first_valid as the base CPU ID, as the base CPU must be within
5026	 * the set of context CPUs.
5027	 *
5028	 * Note not all assigned CPUs will be in the set of context CPUs
5029	 * when separate CPUs are being allocated to TX and RX queues,
5030	 * assignment to logical cores has been enabled, the set of context
5031	 * CPUs contains only physical CPUs, and TX queues are mapped to L2
5032	 * neighbors of CPUs that RX queues have been mapped to - in this
5033	 * case we do only want to count how many CPUs in the set of context
5034	 * CPUs have been consumed, as that determines the next CPU in that
5035	 * set to start allocating at for the next device for which
5036	 * core_offset is not set.
5037	 */
5038	CPU_ZERO(&assigned_cpus);
5039	for (i = 0; i < scctx->isc_ntxqsets; i++)
5040		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
5041		    &assigned_cpus);
5042	for (i = 0; i < scctx->isc_nrxqsets; i++)
5043		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
5044		    &assigned_cpus);
5045	CPU_AND(&assigned_cpus, &ctx->ifc_cpus);
5046	cores_consumed = CPU_COUNT(&assigned_cpus);
5047
5048	mtx_lock(&cpu_offset_mtx);
5049	SLIST_FOREACH(op, &cpu_offsets, entries) {
5050		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
5051			base_cpuid = op->next_cpuid;
5052			op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
5053			    cores_consumed);
5054			MPASS(op->refcount < UINT_MAX);
5055			op->refcount++;
5056			break;
5057		}
5058	}
5059	if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
5060		base_cpuid = first_valid;
5061		op = malloc(sizeof(struct cpu_offset), M_IFLIB,
5062		    M_NOWAIT | M_ZERO);
5063		if (op == NULL) {
5064			device_printf(ctx->ifc_dev,
5065			    "allocation for cpu offset failed.\n");
5066		} else {
5067			op->next_cpuid = cpuid_advance(ctx, base_cpuid,
5068			    cores_consumed);
5069			op->refcount = 1;
5070			CPU_COPY(&ctx->ifc_cpus, &op->set);
5071			SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
5072		}
5073	}
5074	mtx_unlock(&cpu_offset_mtx);
5075
5076	return (base_cpuid);
5077#else
5078	return 0;
5079#endif
5080}
5081
5082static void
5083unref_ctx_core_offset(if_ctx_t ctx)
5084{
5085#ifndef __HAIKU__
5086	struct cpu_offset *op, *top;
5087
5088	mtx_lock(&cpu_offset_mtx);
5089	SLIST_FOREACH_SAFE(op, &cpu_offsets, entries, top) {
5090		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
5091			MPASS(op->refcount > 0);
5092			op->refcount--;
5093			if (op->refcount == 0) {
5094				SLIST_REMOVE(&cpu_offsets, op, cpu_offset, entries);
5095				free(op, M_IFLIB);
5096			}
5097			break;
5098		}
5099	}
5100	mtx_unlock(&cpu_offset_mtx);
5101#endif
5102}
5103
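/*
 * Core attach path for iflib drivers: register the ifnet, run the
 * driver's pre-attach, size the queue sets and TSO limits, set up
 * MSI-X/MSI/legacy interrupts, allocate queue memory, assign CPUs, and
 * finish with ether_ifattach(), the driver's post-attach, and netmap and
 * pfil hookup.
 */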
5104int
5105iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
5106{
5107	if_ctx_t ctx;
5108	if_t ifp;
5109	if_softc_ctx_t scctx;
5110	kobjop_desc_t kobj_desc;
5111	kobj_method_t *kobj_method;
5112	int err, msix, rid;
5113	int num_txd, num_rxd;
5114
5115	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
5116
5117	if (sc == NULL) {
5118		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
5119		device_set_softc(dev, ctx);
5120		ctx->ifc_flags |= IFC_SC_ALLOCATED;
5121	}
5122
5123	ctx->ifc_sctx = sctx;
5124	ctx->ifc_dev = dev;
5125	ctx->ifc_softc = sc;
5126
5127	if ((err = iflib_register(ctx)) != 0) {
5128		device_printf(dev, "iflib_register failed %d\n", err);
5129		goto fail_ctx_free;
5130	}
5131	iflib_add_device_sysctl_pre(ctx);
5132
5133	scctx = &ctx->ifc_softc_ctx;
5134	ifp = ctx->ifc_ifp;
5135
5136	iflib_reset_qvalues(ctx);
5137	CTX_LOCK(ctx);
5138	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
5139		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
5140		goto fail_unlock;
5141	}
5142	_iflib_pre_assert(scctx);
5143	ctx->ifc_txrx = *scctx->isc_txrx;
5144
5145	if (sctx->isc_flags & IFLIB_DRIVER_MEDIA)
5146		ctx->ifc_mediap = scctx->isc_media;
5147
5148#ifdef INVARIANTS
5149	if (scctx->isc_capabilities & IFCAP_TXCSUM)
5150		MPASS(scctx->isc_tx_csum_flags);
5151#endif
5152
5153	if_setcapabilities(ifp,
5154	    scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_MEXTPG);
5155	if_setcapenable(ifp,
5156	    scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_MEXTPG);
5157
5158	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
5159		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
5160	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
5161		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
5162
5163	num_txd = iflib_num_tx_descs(ctx);
5164	num_rxd = iflib_num_rx_descs(ctx);
5165
5166	/* XXX change for per-queue sizes */
5167	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
5168	    num_txd, num_rxd);
5169
5170	if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
5171		scctx->isc_tx_nsegments = max(1, num_txd /
5172		    MAX_SINGLE_PACKET_FRACTION);
5173	if (scctx->isc_tx_tso_segments_max > num_txd /
5174	    MAX_SINGLE_PACKET_FRACTION)
5175		scctx->isc_tx_tso_segments_max = max(1,
5176		    num_txd / MAX_SINGLE_PACKET_FRACTION);
5177
5178	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
5179	if (if_getcapabilities(ifp) & IFCAP_TSO) {
5180#ifndef __HAIKU__
5181		/*
5182		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
5183		 * but some MACs do.
5184		 */
5185		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
5186		    IP_MAXPACKET));
5187		/*
5188		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
5189		 * into account.  In the worst case, each of these calls will
5190		 * add another mbuf and, thus, the requirement for another DMA
5191		 * segment.  So for best performance, it doesn't make sense to
5192		 * advertise a maximum of TSO segments that typically will
5193		 * require defragmentation in iflib_encap().
5194		 */
5195		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
5196		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
5197#endif
5198	}
5199	if (scctx->isc_rss_table_size == 0)
5200		scctx->isc_rss_table_size = 64;
5201	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
5202
5203	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
5204	/* XXX format name */
5205	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
5206	    NULL, NULL, "admin");
5207
5208#ifndef __HAIKU__
5209	/* Set up cpu set.  If it fails, use the set of all CPUs. */
5210	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
5211		device_printf(dev, "Unable to fetch CPU list\n");
5212		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
5213		ctx->ifc_cpus_are_physical_cores = false;
5214	} else
5215		ctx->ifc_cpus_are_physical_cores = true;
5216	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
5217#endif
5218
5219	/*
5220	** Now set up MSI or MSI-X, should return us the number of supported
5221	** vectors (will be 1 for a legacy interrupt and MSI).
5222	*/
5223	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
5224		msix = scctx->isc_vectors;
5225	} else if (scctx->isc_msix_bar != 0)
5226	       /*
5227		* The simple fact that isc_msix_bar is not 0 does not mean we
5228		* have a good value there that is known to work.
5229		*/
5230		msix = iflib_msix_init(ctx);
5231	else {
5232		scctx->isc_vectors = 1;
5233		scctx->isc_ntxqsets = 1;
5234		scctx->isc_nrxqsets = 1;
5235		scctx->isc_intr = IFLIB_INTR_LEGACY;
5236		msix = 0;
5237	}
5238	/* Get memory for the station queues */
5239	if ((err = iflib_queues_alloc(ctx))) {
5240		device_printf(dev, "Unable to allocate queue memory\n");
5241		goto fail_intr_free;
5242	}
5243
5244	if ((err = iflib_qset_structures_setup(ctx)))
5245		goto fail_queues;
5246
5247	/*
5248	 * Now that we know how many queues there are, get the core offset.
5249	 */
5250	ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
5251
5252	if (msix > 1) {
5253		/*
5254		 * When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable
5255		 * aren't the default NULL implementation.
5256		 */
5257		kobj_desc = &ifdi_rx_queue_intr_enable_desc;
5258#ifdef __HAIKU__
5259		kobj_method = kobj_lookup_method(ctx->ops.cls, NULL,
5260#else
5261		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
5262#endif
5263		    kobj_desc);
5264		if (kobj_method == &kobj_desc->deflt) {
5265			device_printf(dev,
5266			    "MSI-X requires ifdi_rx_queue_intr_enable method");
5267			err = EOPNOTSUPP;
5268			goto fail_queues;
5269		}
5270		kobj_desc = &ifdi_tx_queue_intr_enable_desc;
5271#ifdef __HAIKU__
5272		kobj_method = kobj_lookup_method(ctx->ops.cls, NULL,
5273#else
5274		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
5275#endif
5276		    kobj_desc);
5277		if (kobj_method == &kobj_desc->deflt) {
5278			device_printf(dev,
5279			    "MSI-X requires ifdi_tx_queue_intr_enable method");
5280			err = EOPNOTSUPP;
5281			goto fail_queues;
5282		}
5283
5284		/*
5285		 * Assign the MSI-X vectors.
5286		 * Note that the default NULL ifdi_msix_intr_assign method will
5287		 * fail here, too.
5288		 */
5289		err = IFDI_MSIX_INTR_ASSIGN(ctx, msix);
5290		if (err != 0) {
5291			device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n",
5292			    err);
5293			goto fail_queues;
5294		}
5295	} else if (scctx->isc_intr != IFLIB_INTR_MSIX) {
5296		rid = 0;
5297		if (scctx->isc_intr == IFLIB_INTR_MSI) {
5298			MPASS(msix == 1);
5299			rid = 1;
5300		}
5301		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
5302			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
5303			goto fail_queues;
5304		}
5305	} else {
5306		device_printf(dev,
5307		    "Cannot use iflib with only 1 MSI-X interrupt!\n");
5308		err = ENODEV;
5309		goto fail_queues;
5310	}
5311
5312	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
5313
5314	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
5315		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
5316		goto fail_detach;
5317	}
5318
5319	/*
5320	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
5321	 * This must appear after the call to ether_ifattach() because
5322	 * ether_ifattach() sets if_hdrlen to the default value.
5323	 */
5324	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
5325		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
5326
5327	if ((err = iflib_netmap_attach(ctx))) {
5328		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
5329		goto fail_detach;
5330	}
5331	*ctxp = ctx;
5332
5333	DEBUGNET_SET(ctx->ifc_ifp, iflib);
5334
5335	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
5336	iflib_add_device_sysctl_post(ctx);
5337	iflib_add_pfil(ctx);
5338	ctx->ifc_flags |= IFC_INIT_DONE;
5339	CTX_UNLOCK(ctx);
5340
5341	return (0);
5342
5343fail_detach:
5344	ether_ifdetach(ctx->ifc_ifp);
5345fail_queues:
5346	iflib_tqg_detach(ctx);
5347	iflib_tx_structures_free(ctx);
5348	iflib_rx_structures_free(ctx);
5349	IFDI_DETACH(ctx);
5350	IFDI_QUEUES_FREE(ctx);
5351fail_intr_free:
5352	iflib_free_intr_mem(ctx);
5353fail_unlock:
5354	CTX_UNLOCK(ctx);
5355	iflib_deregister(ctx);
5356fail_ctx_free:
5357	device_set_softc(ctx->ifc_dev, NULL);
5358	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5359		free(ctx->ifc_softc, M_IFLIB);
5360	free(ctx, M_IFLIB);
5361	return (err);
5362}
5363
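/*
 * Attach path for pseudo/cloned interfaces: like iflib_device_register()
 * but always allocates the softc, skips MSI-X and PCI interrupt setup,
 * and for fully virtual (IFLIB_PSEUDO) devices returns before allocating
 * hardware queues.
 */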
5364int
5365iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp,
5366					  struct iflib_cloneattach_ctx *clctx)
5367{
5368	int num_txd, num_rxd;
5369	int err;
5370	if_ctx_t ctx;
5371	if_t ifp;
5372	if_softc_ctx_t scctx;
5373	int i;
5374	void *sc;
5375
5376	ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO);
5377	sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
5378	ctx->ifc_flags |= IFC_SC_ALLOCATED;
5379	if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL))
5380		ctx->ifc_flags |= IFC_PSEUDO;
5381
5382	ctx->ifc_sctx = sctx;
5383	ctx->ifc_softc = sc;
5384	ctx->ifc_dev = dev;
5385
5386	if ((err = iflib_register(ctx)) != 0) {
5387		device_printf(dev, "%s: iflib_register failed %d\n", __func__, err);
5388		goto fail_ctx_free;
5389	}
5390	iflib_add_device_sysctl_pre(ctx);
5391
5392	scctx = &ctx->ifc_softc_ctx;
5393	ifp = ctx->ifc_ifp;
5394
5395	iflib_reset_qvalues(ctx);
5396	CTX_LOCK(ctx);
5397	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
5398		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
5399		goto fail_unlock;
5400	}
5401#ifndef __HAIKU__
5402	if (sctx->isc_flags & IFLIB_GEN_MAC)
5403		ether_gen_addr(ifp, &ctx->ifc_mac);
5404#endif
5405	if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name,
5406								clctx->cc_params)) != 0) {
5407		device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err);
5408		goto fail_unlock;
5409	}
5410#ifdef INVARIANTS
5411	if (scctx->isc_capabilities & IFCAP_TXCSUM)
5412		MPASS(scctx->isc_tx_csum_flags);
5413#endif
5414
5415	if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_LINKSTATE);
5416	if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE);
5417
5418	ifp->if_flags |= IFF_NOGROUP;
5419	if (sctx->isc_flags & IFLIB_PSEUDO) {
5420		ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
5421		ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
5422		if (sctx->isc_flags & IFLIB_PSEUDO_ETHER) {
5423			ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
5424		} else {
5425			if_attach(ctx->ifc_ifp);
5426			bpfattach(ctx->ifc_ifp, DLT_NULL, sizeof(u_int32_t));
5427		}
5428
5429		if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
5430			device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
5431			goto fail_detach;
5432		}
5433		*ctxp = ctx;
5434
5435		/*
5436		 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
5437		 * This must appear after the call to ether_ifattach() because
5438		 * ether_ifattach() sets if_hdrlen to the default value.
5439		 */
5440		if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
5441			if_setifheaderlen(ifp,
5442			    sizeof(struct ether_vlan_header));
5443
5444		if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
5445		iflib_add_device_sysctl_post(ctx);
5446		ctx->ifc_flags |= IFC_INIT_DONE;
5447		CTX_UNLOCK(ctx);
5448		return (0);
5449	}
5450	ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
5451	ifmedia_add(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO, 0, NULL);
5452	ifmedia_set(ctx->ifc_mediap, IFM_ETHER | IFM_AUTO);
5453
5454	_iflib_pre_assert(scctx);
5455	ctx->ifc_txrx = *scctx->isc_txrx;
5456
5457	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
5458		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
5459	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
5460		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
5461
5462	num_txd = iflib_num_tx_descs(ctx);
5463	num_rxd = iflib_num_rx_descs(ctx);
5464
5465	/* XXX change for per-queue sizes */
5466	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
5467	    num_txd, num_rxd);
5468
5469	if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
5470		scctx->isc_tx_nsegments = max(1, num_txd /
5471		    MAX_SINGLE_PACKET_FRACTION);
5472	if (scctx->isc_tx_tso_segments_max > num_txd /
5473	    MAX_SINGLE_PACKET_FRACTION)
5474		scctx->isc_tx_tso_segments_max = max(1,
5475		    num_txd / MAX_SINGLE_PACKET_FRACTION);
5476
5477	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
5478	if (if_getcapabilities(ifp) & IFCAP_TSO) {
5479#ifndef __HAIKU__
5480		/*
5481		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
5482		 * but some MACs do.
5483		 */
5484		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
5485		    IP_MAXPACKET));
5486		/*
5487		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
5488		 * into account.  In the worst case, each of these calls will
5489		 * add another mbuf and, thus, the requirement for another DMA
5490		 * segment.  So for best performance, it doesn't make sense to
		 * advertise a maximum of TSO segments that typically will
5492		 * require defragmentation in iflib_encap().
5493		 */
5494		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
5495		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
5496#endif
5497	}
5498	if (scctx->isc_rss_table_size == 0)
5499		scctx->isc_rss_table_size = 64;
5500	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
5501
5502	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
5503	/* XXX format name */
5504	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
5505	    NULL, NULL, "admin");
5506
5507	/* XXX --- can support > 1 -- but keep it simple for now */
5508	scctx->isc_intr = IFLIB_INTR_LEGACY;
5509
5510	/* Get memory for the station queues */
5511	if ((err = iflib_queues_alloc(ctx))) {
5512		device_printf(dev, "Unable to allocate queue memory\n");
5513		goto fail_iflib_detach;
5514	}
5515
5516	if ((err = iflib_qset_structures_setup(ctx))) {
5517		device_printf(dev, "qset structure setup failed %d\n", err);
5518		goto fail_queues;
5519	}
5520
5521	/*
5522	 * XXX What if anything do we want to do about interrupts?
5523	 */
5524	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
5525	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
5526		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
5527		goto fail_detach;
5528	}
5529
5530	/*
5531	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
5532	 * This must appear after the call to ether_ifattach() because
5533	 * ether_ifattach() sets if_hdrlen to the default value.
5534	 */
5535	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
5536		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
5537
5538	/* XXX handle more than one queue */
5539	for (i = 0; i < scctx->isc_nrxqsets; i++)
5540		IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl);
5541
5542	*ctxp = ctx;
5543
5544	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
5545	iflib_add_device_sysctl_post(ctx);
5546	ctx->ifc_flags |= IFC_INIT_DONE;
5547	CTX_UNLOCK(ctx);
5548
5549	return (0);
5550fail_detach:
5551	ether_ifdetach(ctx->ifc_ifp);
5552fail_queues:
5553	iflib_tqg_detach(ctx);
5554	iflib_tx_structures_free(ctx);
5555	iflib_rx_structures_free(ctx);
5556fail_iflib_detach:
5557	IFDI_DETACH(ctx);
5558	IFDI_QUEUES_FREE(ctx);
5559fail_unlock:
5560	CTX_UNLOCK(ctx);
5561	iflib_deregister(ctx);
5562fail_ctx_free:
5563	free(ctx->ifc_softc, M_IFLIB);
5564	free(ctx, M_IFLIB);
5565	return (err);
5566}
5567
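/*
 * Detach and free an interface created by iflib_pseudo_register():
 * unhook VLAN events, detach the ifnet, tear down taskqgroups and queue
 * state, and release the context.
 */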
5568int
5569iflib_pseudo_deregister(if_ctx_t ctx)
5570{
5571	if_t ifp = ctx->ifc_ifp;
5572	if_shared_ctx_t sctx = ctx->ifc_sctx;
5573
5574	/* Unregister VLAN event handlers early */
5575	iflib_unregister_vlan_handlers(ctx);
5576
5577	if ((sctx->isc_flags & IFLIB_PSEUDO)  &&
5578		(sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0) {
5579		bpfdetach(ifp);
5580		if_detach(ifp);
5581	} else {
5582		ether_ifdetach(ifp);
5583	}
5584
5585	iflib_tqg_detach(ctx);
5586	iflib_tx_structures_free(ctx);
5587	iflib_rx_structures_free(ctx);
5588	IFDI_DETACH(ctx);
5589	IFDI_QUEUES_FREE(ctx);
5590
5591	iflib_deregister(ctx);
5592
5593	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5594		free(ctx->ifc_softc, M_IFLIB);
5595	free(ctx, M_IFLIB);
5596	return (0);
5597}
5598
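/*
 * Bus attach entry point.  Ask the driver for its shared context via
 * DEVICE_REGISTER, validate it, enable bus mastering, and hand off to
 * iflib_device_register().
 */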
5599int
5600iflib_device_attach(device_t dev)
5601{
5602	if_ctx_t ctx;
5603	if_shared_ctx_t sctx;
5604
5605	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
5606		return (ENOTSUP);
5607
5608	pci_enable_busmaster(dev);
5609
5610	return (iflib_device_register(dev, NULL, sctx, &ctx));
5611}
5612
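/*
 * Full teardown of a registered device.  Refuses to proceed while VLANs
 * or SR-IOV VFs are still using the interface, then stops the device,
 * detaches netmap and the ifnet, and frees queues, interrupts, and the
 * context itself.
 */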
5613int
5614iflib_device_deregister(if_ctx_t ctx)
5615{
5616	if_t ifp = ctx->ifc_ifp;
5617	device_t dev = ctx->ifc_dev;
5618
	/* Make sure VLANs are not using the driver */
5620	if (if_vlantrunkinuse(ifp)) {
		device_printf(dev, "VLAN in use, detach first\n");
5622		return (EBUSY);
5623	}
5624#ifdef PCI_IOV
5625	if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
5626		device_printf(dev, "SR-IOV in use; detach first.\n");
5627		return (EBUSY);
5628	}
5629#endif
5630
5631	STATE_LOCK(ctx);
5632	ctx->ifc_flags |= IFC_IN_DETACH;
5633	STATE_UNLOCK(ctx);
5634
5635	/* Unregister VLAN handlers before calling iflib_stop() */
5636	iflib_unregister_vlan_handlers(ctx);
5637
5638	iflib_netmap_detach(ifp);
5639	ether_ifdetach(ifp);
5640
5641	CTX_LOCK(ctx);
5642	iflib_stop(ctx);
5643	CTX_UNLOCK(ctx);
5644
5645	iflib_rem_pfil(ctx);
5646	if (ctx->ifc_led_dev != NULL)
5647		led_destroy(ctx->ifc_led_dev);
5648
5649	iflib_tqg_detach(ctx);
5650	iflib_tx_structures_free(ctx);
5651	iflib_rx_structures_free(ctx);
5652
5653	CTX_LOCK(ctx);
5654	IFDI_DETACH(ctx);
5655	IFDI_QUEUES_FREE(ctx);
5656	CTX_UNLOCK(ctx);
5657
	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
5659	iflib_free_intr_mem(ctx);
5660
5661	bus_generic_detach(dev);
5662
5663	iflib_deregister(ctx);
5664
5665	device_set_softc(ctx->ifc_dev, NULL);
5666	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5667		free(ctx->ifc_softc, M_IFLIB);
5668	unref_ctx_core_offset(ctx);
5669	free(ctx, M_IFLIB);
5670	return (0);
5671}
5672
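/*
 * Drain the per-TX-queue callouts and detach all TX, RX, admin, and VFLR
 * grouptasks from their taskqgroups.
 */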
5673static void
5674iflib_tqg_detach(if_ctx_t ctx)
5675{
5676	iflib_txq_t txq;
5677	iflib_rxq_t rxq;
5678	int i;
5679	struct taskqgroup *tqg;
5680
5681	/* XXX drain any dependent tasks */
5682	tqg = qgroup_if_io_tqg;
5683	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
5684		callout_drain(&txq->ift_timer);
5685#ifdef DEV_NETMAP
5686		callout_drain(&txq->ift_netmap_timer);
5687#endif /* DEV_NETMAP */
5688		if (txq->ift_task.gt_uniq != NULL)
5689			taskqgroup_detach(tqg, &txq->ift_task);
5690	}
5691	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
5692		if (rxq->ifr_task.gt_uniq != NULL)
5693			taskqgroup_detach(tqg, &rxq->ifr_task);
5694	}
5695	tqg = qgroup_if_config_tqg;
5696	if (ctx->ifc_admin_task.gt_uniq != NULL)
5697		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
5698	if (ctx->ifc_vflr_task.gt_uniq != NULL)
5699		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
5700}
5701
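/*
 * Release interrupt resources: the legacy IRQ or the MSI/MSI-X vectors,
 * plus the MSI-X table BAR if one was mapped.
 */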
5702static void
5703iflib_free_intr_mem(if_ctx_t ctx)
5704{
5705
5706	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
5707		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
5708	}
5709	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
5710		pci_release_msi(ctx->ifc_dev);
5711	}
5712	if (ctx->ifc_msix_mem != NULL) {
5713		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
5714		    rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
5715		ctx->ifc_msix_mem = NULL;
5716	}
5717}
5718
5719int
5720iflib_device_detach(device_t dev)
5721{
5722	if_ctx_t ctx = device_get_softc(dev);
5723
5724	return (iflib_device_deregister(ctx));
5725}
5726
5727int
5728iflib_device_suspend(device_t dev)
5729{
5730	if_ctx_t ctx = device_get_softc(dev);
5731
5732	CTX_LOCK(ctx);
5733	IFDI_SUSPEND(ctx);
5734	CTX_UNLOCK(ctx);
5735
	return (bus_generic_suspend(dev));
}

int
5739iflib_device_shutdown(device_t dev)
5740{
5741	if_ctx_t ctx = device_get_softc(dev);
5742
5743	CTX_LOCK(ctx);
5744	IFDI_SHUTDOWN(ctx);
5745	CTX_UNLOCK(ctx);
5746
	return (bus_generic_suspend(dev));
5748}
5749
5750int
5751iflib_device_resume(device_t dev)
5752{
5753	if_ctx_t ctx = device_get_softc(dev);
5754	iflib_txq_t txq = ctx->ifc_txqs;
5755
5756	CTX_LOCK(ctx);
5757	IFDI_RESUME(ctx);
5758	iflib_if_init_locked(ctx);
5759	CTX_UNLOCK(ctx);
5760	for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
5761		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
5762
5763	return (bus_generic_resume(dev));
5764}
5765
5766int
5767iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
5768{
5769	int error;
5770	if_ctx_t ctx = device_get_softc(dev);
5771
5772	CTX_LOCK(ctx);
5773	error = IFDI_IOV_INIT(ctx, num_vfs, params);
5774	CTX_UNLOCK(ctx);
5775
5776	return (error);
5777}
5778
5779void
5780iflib_device_iov_uninit(device_t dev)
5781{
5782	if_ctx_t ctx = device_get_softc(dev);
5783
5784	CTX_LOCK(ctx);
5785	IFDI_IOV_UNINIT(ctx);
5786	CTX_UNLOCK(ctx);
5787}
5788
5789int
5790iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
5791{
5792	int error;
5793	if_ctx_t ctx = device_get_softc(dev);
5794
5795	CTX_LOCK(ctx);
5796	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
5797	CTX_UNLOCK(ctx);
5798
5799	return (error);
5800}
5801
5802/*********************************************************************
5803 *
5804 *  MODULE FUNCTION DEFINITIONS
5805 *
5806 **********************************************************************/
5807
5808/*
5809 * - Start a fast taskqueue thread for each core
5810 * - Start a taskqueue for control operations
5811 */
5812static int
5813iflib_module_init(void)
5814{
5815	iflib_timer_default = hz / 2;
5816	return (0);
5817}
5818
5819static int
5820iflib_module_event_handler(module_t mod, int what, void *arg)
5821{
5822	int err;
5823
5824	switch (what) {
5825	case MOD_LOAD:
5826		if ((err = iflib_module_init()) != 0)
5827			return (err);
5828		break;
5829	case MOD_UNLOAD:
5830		return (EBUSY);
5831	default:
5832		return (EOPNOTSUPP);
5833	}
5834
5835	return (0);
5836}
5837
5838/*********************************************************************
5839 *
5840 *  PUBLIC FUNCTION DEFINITIONS
5841 *     ordered as in iflib.h
5842 *
5843 **********************************************************************/
5844
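/*
 * Sanity-check the driver-supplied shared context: DMA sizing fields must
 * be set and the per-queue descriptor limits must be powers of two.
 */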
5845static void
5846_iflib_assert(if_shared_ctx_t sctx)
5847{
5848	int i;
5849
5850	MPASS(sctx->isc_tx_maxsize);
5851	MPASS(sctx->isc_tx_maxsegsize);
5852
5853	MPASS(sctx->isc_rx_maxsize);
5854	MPASS(sctx->isc_rx_nsegments);
5855	MPASS(sctx->isc_rx_maxsegsize);
5856
5857	MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8);
5858	for (i = 0; i < sctx->isc_nrxqs; i++) {
5859		MPASS(sctx->isc_nrxd_min[i]);
5860		MPASS(powerof2(sctx->isc_nrxd_min[i]));
5861		MPASS(sctx->isc_nrxd_max[i]);
5862		MPASS(powerof2(sctx->isc_nrxd_max[i]));
5863		MPASS(sctx->isc_nrxd_default[i]);
5864		MPASS(powerof2(sctx->isc_nrxd_default[i]));
5865	}
5866
5867	MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8);
5868	for (i = 0; i < sctx->isc_ntxqs; i++) {
5869		MPASS(sctx->isc_ntxd_min[i]);
5870		MPASS(powerof2(sctx->isc_ntxd_min[i]));
5871		MPASS(sctx->isc_ntxd_max[i]);
5872		MPASS(powerof2(sctx->isc_ntxd_max[i]));
5873		MPASS(sctx->isc_ntxd_default[i]);
5874		MPASS(powerof2(sctx->isc_ntxd_default[i]));
5875	}
5876}
5877
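/*
 * Verify that the driver filled in all required txrx methods before they
 * are copied into the context.
 */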
5878static void
5879_iflib_pre_assert(if_softc_ctx_t scctx)
5880{
5881
5882	MPASS(scctx->isc_txrx->ift_txd_encap);
5883	MPASS(scctx->isc_txrx->ift_txd_flush);
5884	MPASS(scctx->isc_txrx->ift_txd_credits_update);
5885	MPASS(scctx->isc_txrx->ift_rxd_available);
5886	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
5887	MPASS(scctx->isc_txrx->ift_rxd_refill);
5888	MPASS(scctx->isc_txrx->ift_rxd_flush);
5889}
5890
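/*
 * Allocate and initialize the ifnet, bind the driver's kobj methods to
 * the context, install iflib's if_* methods, register VLAN event
 * handlers, and set up default media handling unless the driver manages
 * media itself.
 */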
5891static int
5892iflib_register(if_ctx_t ctx)
5893{
5894	if_shared_ctx_t sctx = ctx->ifc_sctx;
5895	driver_t *driver = sctx->isc_driver;
5896	device_t dev = ctx->ifc_dev;
5897	if_t ifp;
5898	u_char type;
5899	int iflags;
5900
5901	if ((sctx->isc_flags & IFLIB_PSEUDO) == 0)
5902		_iflib_assert(sctx);
5903
5904	CTX_LOCK_INIT(ctx);
5905	STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
5906	if (sctx->isc_flags & IFLIB_PSEUDO) {
5907		if (sctx->isc_flags & IFLIB_PSEUDO_ETHER)
5908			type = IFT_ETHER;
5909		else
5910			type = IFT_PPP;
5911	} else
5912		type = IFT_ETHER;
5913	ifp = ctx->ifc_ifp = if_alloc(type);
5914	if (ifp == NULL) {
5915		device_printf(dev, "can not allocate ifnet structure\n");
5916		return (ENOMEM);
5917	}
5918
5919	/*
5920	 * Initialize our context's device specific methods
5921	 */
5922	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
5923	kobj_class_compile((kobj_class_t) driver);
5924
5925	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
5926	if_setsoftc(ifp, ctx);
5927	if_setdev(ifp, dev);
5928	if_setinitfn(ifp, iflib_if_init);
5929	if_setioctlfn(ifp, iflib_if_ioctl);
5930#ifdef ALTQ
5931	if_setstartfn(ifp, iflib_altq_if_start);
5932	if_settransmitfn(ifp, iflib_altq_if_transmit);
5933	if_setsendqready(ifp);
5934#else
5935	if_settransmitfn(ifp, iflib_if_transmit);
5936#endif
5937	if_setqflushfn(ifp, iflib_if_qflush);
5938#ifndef __HAIKU__
5939	iflags = IFF_MULTICAST | IFF_KNOWSEPOCH;
5940#else
5941	iflags = IFF_MULTICAST;
5942#endif
5943
5944	if ((sctx->isc_flags & IFLIB_PSEUDO) &&
5945		(sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0)
5946		iflags |= IFF_POINTOPOINT;
5947	else
5948		iflags |= IFF_BROADCAST | IFF_SIMPLEX;
5949	if_setflags(ifp, iflags);
5950	ctx->ifc_vlan_attach_event =
5951		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
5952							  EVENTHANDLER_PRI_FIRST);
5953	ctx->ifc_vlan_detach_event =
5954		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
5955							  EVENTHANDLER_PRI_FIRST);
5956
5957	if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) {
5958		ctx->ifc_mediap = &ctx->ifc_media;
5959		ifmedia_init(ctx->ifc_mediap, IFM_IMASK,
5960		    iflib_media_change, iflib_media_status);
5961	}
5962	return (0);
5963}
5964
5965static void
5966iflib_unregister_vlan_handlers(if_ctx_t ctx)
5967{
5968	/* Unregister VLAN events */
5969	if (ctx->ifc_vlan_attach_event != NULL) {
5970		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
5971		ctx->ifc_vlan_attach_event = NULL;
5972	}
5973	if (ctx->ifc_vlan_detach_event != NULL) {
5974		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
5975		ctx->ifc_vlan_detach_event = NULL;
	}
}
5979
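/*
 * Undo iflib_register(): remove media entries, VLAN event handlers, the
 * kobj reference, the ifnet, and the context locks.
 */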
5980static void
5981iflib_deregister(if_ctx_t ctx)
5982{
5983	if_t ifp = ctx->ifc_ifp;
5984
5985	/* Remove all media */
5986	ifmedia_removeall(&ctx->ifc_media);
5987
5988	/* Ensure that VLAN event handlers are unregistered */
5989	iflib_unregister_vlan_handlers(ctx);
5990
5991#ifndef __HAIKU__
5992	/* Release kobject reference */
5993	kobj_delete((kobj_t) ctx, NULL);
5994#endif
5995
5996	/* Free the ifnet structure */
5997	if_free(ifp);
5998
5999	STATE_LOCK_DESTROY(ctx);
6000
	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
6002	CTX_LOCK_DESTROY(ctx);
6003}
6004
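/*
 * Allocate the software TX and RX queue sets along with their DMA
 * descriptor areas, buf_rings, and free lists, then pass the descriptor
 * addresses to the driver via IFDI_TX_QUEUES_ALLOC/IFDI_RX_QUEUES_ALLOC.
 */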
6005static int
6006iflib_queues_alloc(if_ctx_t ctx)
6007{
6008	if_shared_ctx_t sctx = ctx->ifc_sctx;
6009	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
6010	device_t dev = ctx->ifc_dev;
6011	int nrxqsets = scctx->isc_nrxqsets;
6012	int ntxqsets = scctx->isc_ntxqsets;
6013	iflib_txq_t txq;
6014	iflib_rxq_t rxq;
6015	iflib_fl_t fl = NULL;
6016	int i, j, cpu, err, txconf, rxconf;
6017	iflib_dma_info_t ifdip;
6018	uint32_t *rxqsizes = scctx->isc_rxqsizes;
6019	uint32_t *txqsizes = scctx->isc_txqsizes;
6020	uint8_t nrxqs = sctx->isc_nrxqs;
6021	uint8_t ntxqs = sctx->isc_ntxqs;
6022	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
6023	int fl_offset = (sctx->isc_flags & IFLIB_HAS_RXCQ ? 1 : 0);
6024	caddr_t *vaddrs;
6025	uint64_t *paddrs;
6026
6027	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
6028	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
6029	KASSERT(nrxqs >= fl_offset + nfree_lists,
	    ("there must be at least one rxq for each free list"));
6031
6032	/* Allocate the TX ring struct memory */
6033	if (!(ctx->ifc_txqs =
6034	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
6035	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
6036		device_printf(dev, "Unable to allocate TX ring memory\n");
6037		err = ENOMEM;
6038		goto fail;
6039	}
6040
6041	/* Now allocate the RX */
6042	if (!(ctx->ifc_rxqs =
6043	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
6044	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
6045		device_printf(dev, "Unable to allocate RX ring memory\n");
6046		err = ENOMEM;
6047		goto rx_fail;
6048	}
6049
6050	txq = ctx->ifc_txqs;
6051	rxq = ctx->ifc_rxqs;
6052
6053	/*
6054	 * XXX handle allocation failure
6055	 */
6056	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
6057		/* Set up some basics */
6058
6059		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
6060		    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
6061			device_printf(dev,
6062			    "Unable to allocate TX DMA info memory\n");
6063			err = ENOMEM;
6064			goto err_tx_desc;
6065		}
6066		txq->ift_ifdi = ifdip;
6067		for (j = 0; j < ntxqs; j++, ifdip++) {
6068			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
6069				device_printf(dev,
6070				    "Unable to allocate TX descriptors\n");
6071				err = ENOMEM;
6072				goto err_tx_desc;
6073			}
6074			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
6075			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
6076		}
6077		txq->ift_ctx = ctx;
6078		txq->ift_id = i;
6079		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
6080			txq->ift_br_offset = 1;
6081		} else {
6082			txq->ift_br_offset = 0;
6083		}
6084
6085		if (iflib_txsd_alloc(txq)) {
6086			device_printf(dev, "Critical Failure setting up TX buffers\n");
6087			err = ENOMEM;
6088			goto err_tx_desc;
6089		}
6090
6091		/* Initialize the TX lock */
6092		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout",
6093		    device_get_nameunit(dev), txq->ift_id);
6094		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
6095		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
6096#ifndef __HAIKU__
6097		txq->ift_timer.c_cpu = cpu;
6098#endif
6099#ifdef DEV_NETMAP
6100		callout_init_mtx(&txq->ift_netmap_timer, &txq->ift_mtx, 0);
6101		txq->ift_netmap_timer.c_cpu = cpu;
6102#endif /* DEV_NETMAP */
6103
6104		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
6105				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
6106		if (err) {
6107			/* XXX free any allocated rings */
6108			device_printf(dev, "Unable to allocate buf_ring\n");
6109			goto err_tx_desc;
6110		}
6111	}
6112
6113	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
6114		/* Set up some basics */
6115		callout_init(&rxq->ifr_watchdog, 1);
6116
6117		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
		    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
6119			device_printf(dev,
6120			    "Unable to allocate RX DMA info memory\n");
6121			err = ENOMEM;
6122			goto err_tx_desc;
6123		}
6124
6125		rxq->ifr_ifdi = ifdip;
6126		/* XXX this needs to be changed if #rx queues != #tx queues */
6127		rxq->ifr_ntxqirq = 1;
6128		rxq->ifr_txqid[0] = i;
6129		for (j = 0; j < nrxqs; j++, ifdip++) {
6130			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
6131				device_printf(dev,
6132				    "Unable to allocate RX descriptors\n");
6133				err = ENOMEM;
6134				goto err_tx_desc;
6135			}
6136			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
6137		}
6138		rxq->ifr_ctx = ctx;
6139		rxq->ifr_id = i;
6140		rxq->ifr_fl_offset = fl_offset;
6141		rxq->ifr_nfl = nfree_lists;
		if (!(fl = (iflib_fl_t) malloc(sizeof(struct iflib_fl) *
		    nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
6144			device_printf(dev, "Unable to allocate free list memory\n");
6145			err = ENOMEM;
6146			goto err_tx_desc;
6147		}
6148		rxq->ifr_fl = fl;
6149		for (j = 0; j < nfree_lists; j++) {
6150			fl[j].ifl_rxq = rxq;
6151			fl[j].ifl_id = j;
6152			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
6153			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
6154		}
6155		/* Allocate receive buffers for the ring */
6156		if (iflib_rxsd_alloc(rxq)) {
6157			device_printf(dev,
6158			    "Critical Failure setting up receive buffers\n");
6159			err = ENOMEM;
6160			goto err_rx_desc;
6161		}
6162
6163		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
6164			fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
6165			    M_WAITOK);
6166	}
6167
6168	/* TXQs */
6169	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
6170	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
6171	for (i = 0; i < ntxqsets; i++) {
6172		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
6173
6174		for (j = 0; j < ntxqs; j++, di++) {
6175			vaddrs[i*ntxqs + j] = di->idi_vaddr;
6176			paddrs[i*ntxqs + j] = di->idi_paddr;
6177		}
6178	}
6179	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
6180		device_printf(ctx->ifc_dev,
6181		    "Unable to allocate device TX queue\n");
6182		iflib_tx_structures_free(ctx);
6183		free(vaddrs, M_IFLIB);
6184		free(paddrs, M_IFLIB);
6185		goto err_rx_desc;
6186	}
6187	free(vaddrs, M_IFLIB);
6188	free(paddrs, M_IFLIB);
6189
6190	/* RXQs */
6191	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
6192	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
6193	for (i = 0; i < nrxqsets; i++) {
6194		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
6195
6196		for (j = 0; j < nrxqs; j++, di++) {
6197			vaddrs[i*nrxqs + j] = di->idi_vaddr;
6198			paddrs[i*nrxqs + j] = di->idi_paddr;
6199		}
6200	}
6201	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
6202		device_printf(ctx->ifc_dev,
6203		    "Unable to allocate device RX queue\n");
6204		iflib_tx_structures_free(ctx);
6205		free(vaddrs, M_IFLIB);
6206		free(paddrs, M_IFLIB);
6207		goto err_rx_desc;
6208	}
6209	free(vaddrs, M_IFLIB);
6210	free(paddrs, M_IFLIB);
6211
6212	return (0);
6213
6214/* XXX handle allocation failure changes */
6215err_rx_desc:
6216err_tx_desc:
6217rx_fail:
6218	if (ctx->ifc_rxqs != NULL)
6219		free(ctx->ifc_rxqs, M_IFLIB);
6220	ctx->ifc_rxqs = NULL;
6221	if (ctx->ifc_txqs != NULL)
6222		free(ctx->ifc_txqs, M_IFLIB);
6223	ctx->ifc_txqs = NULL;
6224fail:
6225	return (err);
6226}
6227
6228static int
6229iflib_tx_structures_setup(if_ctx_t ctx)
6230{
6231	iflib_txq_t txq = ctx->ifc_txqs;
6232	int i;
6233
6234	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
6235		iflib_txq_setup(txq);
6236
6237	return (0);
6238}
6239
6240static void
6241iflib_tx_structures_free(if_ctx_t ctx)
6242{
6243	iflib_txq_t txq = ctx->ifc_txqs;
6244	if_shared_ctx_t sctx = ctx->ifc_sctx;
6245	int i, j;
6246
6247	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
6248		for (j = 0; j < sctx->isc_ntxqs; j++)
6249			iflib_dma_free(&txq->ift_ifdi[j]);
6250		iflib_txq_destroy(txq);
6251	}
6252	free(ctx->ifc_txqs, M_IFLIB);
6253	ctx->ifc_txqs = NULL;
6254}
6255
6256/*********************************************************************
6257 *
6258 *  Initialize all receive rings.
6259 *
6260 **********************************************************************/
6261static int
6262iflib_rx_structures_setup(if_ctx_t ctx)
6263{
6264	iflib_rxq_t rxq = ctx->ifc_rxqs;
6265	int q;
6266#if defined(INET6) || defined(INET)
6267	int err, i;
6268#endif
6269
6270	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
6271#if defined(INET6) || defined(INET)
6272		if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO) {
6273			err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
6274			    TCP_LRO_ENTRIES, min(1024,
6275			    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]));
6276			if (err != 0) {
6277				device_printf(ctx->ifc_dev,
6278				    "LRO Initialization failed!\n");
6279				goto fail;
6280			}
6281		}
6282#endif
6283		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
6284	}
6285	return (0);
6286#if defined(INET6) || defined(INET)
6287fail:
6288	/*
	 * Free LRO resources allocated so far.  We will only handle
	 * the rings that completed; the failing case will have
	 * cleaned up for itself.  'q' failed, so it's the terminus.
6292	 */
6293	rxq = ctx->ifc_rxqs;
6294	for (i = 0; i < q; ++i, rxq++) {
6295		if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO)
6296			tcp_lro_free(&rxq->ifr_lc);
6297	}
6298	return (err);
6299#endif
6300}
6301
6302/*********************************************************************
6303 *
6304 *  Free all receive rings.
6305 *
6306 **********************************************************************/
6307static void
6308iflib_rx_structures_free(if_ctx_t ctx)
6309{
6310	iflib_rxq_t rxq = ctx->ifc_rxqs;
6311	if_shared_ctx_t sctx = ctx->ifc_sctx;
6312	int i, j;
6313
6314	for (i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
6315		for (j = 0; j < sctx->isc_nrxqs; j++)
6316			iflib_dma_free(&rxq->ifr_ifdi[j]);
6317		iflib_rx_sds_free(rxq);
6318#if defined(INET6) || defined(INET)
6319		if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO)
6320			tcp_lro_free(&rxq->ifr_lc);
6321#endif
6322	}
6323	free(ctx->ifc_rxqs, M_IFLIB);
6324	ctx->ifc_rxqs = NULL;
6325}
6326
6327static int
6328iflib_qset_structures_setup(if_ctx_t ctx)
6329{
6330	int err;
6331
6332	/*
6333	 * It is expected that the caller takes care of freeing queues if this
6334	 * fails.
6335	 */
6336	if ((err = iflib_tx_structures_setup(ctx)) != 0) {
6337		device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
6338		return (err);
6339	}
6340
6341	if ((err = iflib_rx_structures_setup(ctx)) != 0)
6342		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
6343
6344	return (err);
6345}
6346
6347int
6348iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
6349		driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
6350{
6351
6352	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
6353}
6354
6355/* Just to avoid copy/paste */
6356static inline int
6357iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
6358    int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq,
6359    const char *name)
6360{
6361	device_t dev;
6362	unsigned int base_cpuid, cpuid;
6363	int err;
6364
6365	dev = ctx->ifc_dev;
6366	base_cpuid = ctx->ifc_sysctl_core_offset;
6367	cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
6368	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev,
6369	    irq ? irq->ii_res : NULL, name);
6370	if (err) {
6371		device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
6372		return (err);
6373	}
6374#ifdef notyet
6375	if (cpuid > ctx->ifc_cpuid_highest)
6376		ctx->ifc_cpuid_highest = cpuid;
6377#endif
6378	return (0);
6379}
6380
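/*
 * Allocate an interrupt of the given type (TX, RX, combined RX/TX, or
 * admin), set up its filter info and grouptask, and attach the task to
 * the appropriate taskqgroup, with CPU affinity for queue interrupts.
 */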
6381int
6382iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
6383			iflib_intr_type_t type, driver_filter_t *filter,
6384			void *filter_arg, int qid, const char *name)
6385{
6386	device_t dev;
6387	struct grouptask *gtask;
6388	struct taskqgroup *tqg;
6389	iflib_filter_info_t info;
6390	gtask_fn_t *fn;
6391	int tqrid, err;
6392	driver_filter_t *intr_fast;
6393	void *q;
6394
6395	info = &ctx->ifc_filter_info;
6396	tqrid = rid;
6397
6398	switch (type) {
6399	/* XXX merge tx/rx for netmap? */
6400	case IFLIB_INTR_TX:
6401		q = &ctx->ifc_txqs[qid];
6402		info = &ctx->ifc_txqs[qid].ift_filter_info;
6403		gtask = &ctx->ifc_txqs[qid].ift_task;
6404		tqg = qgroup_if_io_tqg;
6405		fn = _task_fn_tx;
6406		intr_fast = iflib_fast_intr;
6407		GROUPTASK_INIT(gtask, 0, fn, q);
6408		ctx->ifc_flags |= IFC_NETMAP_TX_IRQ;
6409		break;
6410	case IFLIB_INTR_RX:
6411		q = &ctx->ifc_rxqs[qid];
6412		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
6413		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6414		tqg = qgroup_if_io_tqg;
6415		fn = _task_fn_rx;
6416		intr_fast = iflib_fast_intr;
6417		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6418		break;
6419	case IFLIB_INTR_RXTX:
6420		q = &ctx->ifc_rxqs[qid];
6421		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
6422		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6423		tqg = qgroup_if_io_tqg;
6424		fn = _task_fn_rx;
6425		intr_fast = iflib_fast_intr_rxtx;
6426		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6427		break;
6428	case IFLIB_INTR_ADMIN:
6429		q = ctx;
6430		tqrid = -1;
6431		info = &ctx->ifc_filter_info;
6432		gtask = &ctx->ifc_admin_task;
6433		tqg = qgroup_if_config_tqg;
6434		fn = _task_fn_admin;
6435		intr_fast = iflib_fast_intr_ctx;
6436		break;
6437	default:
6438		device_printf(ctx->ifc_dev, "%s: unknown net intr type\n",
6439		    __func__);
6440		return (EINVAL);
6441	}
6442
6443	info->ifi_filter = filter;
6444	info->ifi_filter_arg = filter_arg;
6445	info->ifi_task = gtask;
6446	info->ifi_ctx = q;
6447
6448	dev = ctx->ifc_dev;
6449	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info,  name);
6450	if (err != 0) {
6451		device_printf(dev, "_iflib_irq_alloc failed %d\n", err);
6452		return (err);
6453	}
6454	if (type == IFLIB_INTR_ADMIN)
6455		return (0);
6456
6457	if (tqrid != -1) {
6458		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
6459		    name);
6460		if (err)
6461			return (err);
6462	} else {
6463		taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
6464	}
6465
6466	return (0);
6467}
6468
6469void
6470iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
6471{
6472	device_t dev;
6473	struct grouptask *gtask;
6474	struct taskqgroup *tqg;
6475	gtask_fn_t *fn;
6476	void *q;
6477	int err;
6478
6479	switch (type) {
6480	case IFLIB_INTR_TX:
6481		q = &ctx->ifc_txqs[qid];
6482		gtask = &ctx->ifc_txqs[qid].ift_task;
6483		tqg = qgroup_if_io_tqg;
6484		fn = _task_fn_tx;
6485		GROUPTASK_INIT(gtask, 0, fn, q);
6486		break;
6487	case IFLIB_INTR_RX:
6488		q = &ctx->ifc_rxqs[qid];
6489		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6490		tqg = qgroup_if_io_tqg;
6491		fn = _task_fn_rx;
6492		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6493		break;
6494	case IFLIB_INTR_IOV:
6495		q = ctx;
6496		gtask = &ctx->ifc_vflr_task;
6497		tqg = qgroup_if_config_tqg;
6498		fn = _task_fn_iov;
6499		GROUPTASK_INIT(gtask, 0, fn, q);
6500		break;
6501	default:
6502		panic("unknown net intr type");
6503	}
6504	err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name);
6505	if (err) {
6506		dev = ctx->ifc_dev;
6507		taskqgroup_attach(tqg, gtask, q, dev, irq ? irq->ii_res : NULL,
6508		    name);
6509	}
6510}
6511
6512void
6513iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
6514{
6515#ifdef __HAIKU__
6516	if (!ctx || !irq)
6517		return;
6518#endif
6519
6520	if (irq->ii_tag)
6521		bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
6522
6523	if (irq->ii_res)
6524		bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ,
6525		    rman_get_rid(irq->ii_res), irq->ii_res);
6526}
6527
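/*
 * Set up a single shared (legacy or MSI) interrupt that services rxq 0,
 * and attach both the RX and TX grouptasks to it.
 */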
6528static int
6529iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name)
6530{
6531	iflib_txq_t txq = ctx->ifc_txqs;
6532	iflib_rxq_t rxq = ctx->ifc_rxqs;
6533	if_irq_t irq = &ctx->ifc_legacy_irq;
6534	iflib_filter_info_t info;
6535	device_t dev;
6536	struct grouptask *gtask;
6537	struct resource *res;
6538	struct taskqgroup *tqg;
6539	void *q;
6540	int err, tqrid;
6541	bool rx_only;
6542
6543	q = &ctx->ifc_rxqs[0];
6544	info = &rxq[0].ifr_filter_info;
6545	gtask = &rxq[0].ifr_task;
6546	tqg = qgroup_if_io_tqg;
6547	tqrid = *rid;
6548	rx_only = (ctx->ifc_sctx->isc_flags & IFLIB_SINGLE_IRQ_RX_ONLY) != 0;
6549
6550	ctx->ifc_flags |= IFC_LEGACY;
6551	info->ifi_filter = filter;
6552	info->ifi_filter_arg = filter_arg;
6553	info->ifi_task = gtask;
6554	info->ifi_ctx = rx_only ? ctx : q;
6555
6556	dev = ctx->ifc_dev;
6557	/* We allocate a single interrupt resource */
6558	err = _iflib_irq_alloc(ctx, irq, tqrid, rx_only ? iflib_fast_intr_ctx :
6559	    iflib_fast_intr_rxtx, NULL, info, name);
6560	if (err != 0)
6561		return (err);
6562	NET_GROUPTASK_INIT(gtask, 0, _task_fn_rx, q);
6563	res = irq->ii_res;
6564	taskqgroup_attach(tqg, gtask, q, dev, res, name);
6565
6566	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
6567	taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, dev, res,
6568	    "tx");
6569	return (0);
6570}
6571
6572void
6573iflib_led_create(if_ctx_t ctx)
6574{
6575
6576	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
6577	    device_get_nameunit(ctx->ifc_dev));
6578}
6579
6580void
6581iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
6582{
6583
6584	GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
6585}
6586
6587void
6588iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
6589{
6590
6591	GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
6592}
6593
6594void
6595iflib_admin_intr_deferred(if_ctx_t ctx)
6596{
6597
6598	MPASS(ctx->ifc_admin_task.gt_taskqueue != NULL);
6599	GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
6600}
6601
6602void
6603iflib_iov_intr_deferred(if_ctx_t ctx)
6604{
6605
6606	GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
6607}
6608
6609void
6610iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, const char *name)
6611{
6612
6613	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, NULL, NULL,
6614	    name);
6615}
6616
6617void
6618iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
6619	const char *name)
6620{
6621
6622	GROUPTASK_INIT(gtask, 0, fn, ctx);
6623	taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, NULL, NULL,
6624	    name);
6625}
6626
6627void
6628iflib_config_gtask_deinit(struct grouptask *gtask)
6629{
6630
6631	taskqgroup_detach(qgroup_if_config_tqg, gtask);
6632}
6633
6634void
6635iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
6636{
6637	if_t ifp = ctx->ifc_ifp;
6638	iflib_txq_t txq = ctx->ifc_txqs;
6639
6640	if_setbaudrate(ifp, baudrate);
6641	if (baudrate >= IF_Gbps(10)) {
6642		STATE_LOCK(ctx);
6643		ctx->ifc_flags |= IFC_PREFETCH;
6644		STATE_UNLOCK(ctx);
6645	}
6646	/* If link down, disable watchdog */
6647	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
6648		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
6649			txq->ift_qstatus = IFLIB_QUEUE_IDLE;
6650	}
6651	ctx->ifc_link_state = link_state;
6652	if_link_state_change(ifp, link_state);
6653}
6654
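/*
 * Ask the driver how many TX descriptors have completed and advance the
 * queue's processed counters accordingly.
 */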
6655static int
6656iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
6657{
6658	int credits;
6659#ifdef INVARIANTS
6660	int credits_pre = txq->ift_cidx_processed;
6661#endif
6662
6663	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
6664	    BUS_DMASYNC_POSTREAD);
6665	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
6666		return (0);
6667
6668	txq->ift_processed += credits;
6669	txq->ift_cidx_processed += credits;
6670
6671	MPASS(credits_pre + credits == txq->ift_cidx_processed);
6672	if (txq->ift_cidx_processed >= txq->ift_size)
6673		txq->ift_cidx_processed -= txq->ift_size;
6674	return (credits);
6675}
6676
6677static int
6678iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
6679{
6680	iflib_fl_t fl;
6681	u_int i;
6682
6683	for (i = 0, fl = &rxq->ifr_fl[0]; i < rxq->ifr_nfl; i++, fl++)
6684		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
6685		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
6686	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
6687	    budget));
6688}
6689
6690void
6691iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
6692	const char *description, if_int_delay_info_t info,
6693	int offset, int value)
6694{
6695	info->iidi_ctx = ctx;
6696	info->iidi_offset = offset;
6697	info->iidi_value = value;
6698	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
6699	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
6700	    OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
6701	    info, 0, iflib_sysctl_int_delay, "I", description);
6702}
6703
6704struct sx *
6705iflib_ctx_lock_get(if_ctx_t ctx)
6706{
6707
6708	return (&ctx->ifc_ctx_sx);
6709}
6710
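/*
 * Work out how many MSI-X vectors to request, based on the available
 * message count, CPUs, RSS buckets, and sysctl overrides, then allocate
 * them.  Falls back to MSI or a legacy interrupt if MSI-X cannot be used.
 */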
6711static int
6712iflib_msix_init(if_ctx_t ctx)
6713{
6714	device_t dev = ctx->ifc_dev;
6715	if_shared_ctx_t sctx = ctx->ifc_sctx;
6716	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
6717	int admincnt, bar, err, iflib_num_rx_queues, iflib_num_tx_queues;
6718	int msgs, queuemsgs, queues, rx_queues, tx_queues, vectors;
6719
6720	iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
6721	iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
6722
6723	if (bootverbose)
6724		device_printf(dev, "msix_init qsets capped at %d\n",
6725		    imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
6726
6727	/* Override by tuneable */
6728	if (scctx->isc_disable_msix)
6729		goto msi;
6730
6731	/* First try MSI-X */
6732	if ((msgs = pci_msix_count(dev)) == 0) {
6733		if (bootverbose)
6734			device_printf(dev, "MSI-X not supported or disabled\n");
6735		goto msi;
6736	}
6737
6738	bar = ctx->ifc_softc_ctx.isc_msix_bar;
6739	/*
6740	 * bar == -1 => "trust me I know what I'm doing"
6741	 * Some drivers are for hardware that is so shoddily
6742	 * documented that no one knows which bars are which
6743	 * so the developer has to map all bars. This hack
6744	 * allows shoddy garbage to use MSI-X in this framework.
6745	 */
6746	if (bar != -1) {
6747		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
		    SYS_RES_MEMORY, &bar, RF_ACTIVE);
6749		if (ctx->ifc_msix_mem == NULL) {
6750			device_printf(dev, "Unable to map MSI-X table\n");
6751			goto msi;
6752		}
6753	}
6754
6755	admincnt = sctx->isc_admin_intrcnt;
6756#if IFLIB_DEBUG
6757	/* use only 1 qset in debug mode */
6758	queuemsgs = min(msgs - admincnt, 1);
6759#else
6760	queuemsgs = msgs - admincnt;
6761#endif
6762#ifdef RSS
6763	queues = imin(queuemsgs, rss_getnumbuckets());
6764#else
6765	queues = queuemsgs;
6766#endif
6767#ifndef __HAIKU__
6768	queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
6769	if (bootverbose)
6770		device_printf(dev,
6771		    "intr CPUs: %d queue msgs: %d admincnt: %d\n",
6772		    CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
6773#endif
6774#ifdef  RSS
6775	/* If we're doing RSS, clamp at the number of RSS buckets */
6776	if (queues > rss_getnumbuckets())
6777		queues = rss_getnumbuckets();
6778#endif
6779	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
6780		rx_queues = iflib_num_rx_queues;
6781	else
6782		rx_queues = queues;
6783
6784	if (rx_queues > scctx->isc_nrxqsets)
6785		rx_queues = scctx->isc_nrxqsets;
6786
6787	/*
6788	 * We want this to be all logical CPUs by default
6789	 */
6790	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
6791		tx_queues = iflib_num_tx_queues;
6792	else
6793		tx_queues = mp_ncpus;
6794
6795	if (tx_queues > scctx->isc_ntxqsets)
6796		tx_queues = scctx->isc_ntxqsets;
6797
6798	if (ctx->ifc_sysctl_qs_eq_override == 0) {
6799#ifdef INVARIANTS
6800		if (tx_queues != rx_queues)
6801			device_printf(dev,
6802			    "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
6803			    min(rx_queues, tx_queues), min(rx_queues, tx_queues));
6804#endif
6805		tx_queues = min(rx_queues, tx_queues);
6806		rx_queues = min(rx_queues, tx_queues);
6807	}
6808
6809	vectors = rx_queues + admincnt;
6810	if (msgs < vectors) {
6811		device_printf(dev,
6812		    "insufficient number of MSI-X vectors "
6813		    "(supported %d, need %d)\n", msgs, vectors);
6814		goto msi;
6815	}
6816
6817	device_printf(dev, "Using %d RX queues %d TX queues\n", rx_queues,
6818	    tx_queues);
6819	msgs = vectors;
6820	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
6821		if (vectors != msgs) {
6822			device_printf(dev,
6823			    "Unable to allocate sufficient MSI-X vectors "
6824			    "(got %d, need %d)\n", vectors, msgs);
6825			pci_release_msi(dev);
6826			if (bar != -1) {
6827				bus_release_resource(dev, SYS_RES_MEMORY, bar,
6828				    ctx->ifc_msix_mem);
6829				ctx->ifc_msix_mem = NULL;
6830			}
6831			goto msi;
6832		}
6833		device_printf(dev, "Using MSI-X interrupts with %d vectors\n",
6834		    vectors);
6835		scctx->isc_vectors = vectors;
6836		scctx->isc_nrxqsets = rx_queues;
6837		scctx->isc_ntxqsets = tx_queues;
6838		scctx->isc_intr = IFLIB_INTR_MSIX;
6839
6840		return (vectors);
6841	} else {
6842		device_printf(dev,
6843		    "failed to allocate %d MSI-X vectors, err: %d\n", vectors,
6844		    err);
6845		if (bar != -1) {
6846			bus_release_resource(dev, SYS_RES_MEMORY, bar,
6847			    ctx->ifc_msix_mem);
6848			ctx->ifc_msix_mem = NULL;
6849		}
6850	}
6851
6852msi:
6853	vectors = pci_msi_count(dev);
6854	scctx->isc_nrxqsets = 1;
6855	scctx->isc_ntxqsets = 1;
6856	scctx->isc_vectors = vectors;
6857	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
		device_printf(dev, "Using an MSI interrupt\n");
6859		scctx->isc_intr = IFLIB_INTR_MSI;
6860	} else {
6861		scctx->isc_vectors = 1;
		device_printf(dev, "Using a Legacy interrupt\n");
6863		scctx->isc_intr = IFLIB_INTR_LEGACY;
6864	}
6865
6866	return (vectors);
6867}
6868
6869static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
6870
6871#ifndef __HAIKU__
6872static int
6873mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
6874{
6875	int rc;
6876	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
6877	struct sbuf *sb;
6878	const char *ring_state = "UNKNOWN";
6879
6880	/* XXX needed ? */
6881	rc = sysctl_wire_old_buffer(req, 0);
6882	MPASS(rc == 0);
6883	if (rc != 0)
6884		return (rc);
6885	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
6886	MPASS(sb != NULL);
6887	if (sb == NULL)
6888		return (ENOMEM);
6889	if (state[3] <= 3)
6890		ring_state = ring_states[state[3]];
6891
6892	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
6893		    state[0], state[1], state[2], ring_state);
6894	rc = sbuf_finish(sb);
6895	sbuf_delete(sb);
	return (rc);
6897}
6898#endif
6899
6900enum iflib_ndesc_handler {
6901	IFLIB_NTXD_HANDLER,
6902	IFLIB_NRXD_HANDLER,
6903};
6904
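/*
 * Sysctl handler for the override_ntxds/override_nrxds tunables: prints
 * or parses the comma-separated list of per-queue descriptor counts.
 */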
6905static int
6906mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
6907{
6908	if_ctx_t ctx = (void *)arg1;
6909	enum iflib_ndesc_handler type = arg2;
6910	char buf[256] = {0};
6911	qidx_t *ndesc;
6912	char *p, *next;
6913	int nqs, rc, i;
6914
6915	nqs = 8;
	switch (type) {
6917	case IFLIB_NTXD_HANDLER:
6918		ndesc = ctx->ifc_sysctl_ntxds;
6919		if (ctx->ifc_sctx)
6920			nqs = ctx->ifc_sctx->isc_ntxqs;
6921		break;
6922	case IFLIB_NRXD_HANDLER:
6923		ndesc = ctx->ifc_sysctl_nrxds;
6924		if (ctx->ifc_sctx)
6925			nqs = ctx->ifc_sctx->isc_nrxqs;
6926		break;
6927	default:
6928		printf("%s: unhandled type\n", __func__);
6929		return (EINVAL);
6930	}
6931	if (nqs == 0)
6932		nqs = 8;
6933
	for (i = 0; i < 8; i++) {
6935		if (i >= nqs)
6936			break;
6937		if (i)
6938			strcat(buf, ",");
6939		sprintf(strchr(buf, 0), "%d", ndesc[i]);
6940	}
6941
6942	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
6943	if (rc || req->newptr == NULL)
		return (rc);
6945
6946	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
6947	    i++, p = strsep(&next, " ,")) {
6948		ndesc[i] = strtoul(p, NULL, 10);
6949	}
6950
6951	return(rc);
6952}
6953
6954#define NAME_BUFLEN 32
6955static void
6956iflib_add_device_sysctl_pre(if_ctx_t ctx)
6957{
6958#ifndef __HAIKU__
	device_t dev = iflib_get_dev(ctx);
6960	struct sysctl_oid_list *child, *oid_list;
6961	struct sysctl_ctx_list *ctx_list;
6962	struct sysctl_oid *node;
6963
6964	ctx_list = device_get_sysctl_ctx(dev);
6965	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
6966	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
6967	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "IFLIB fields");
6968	oid_list = SYSCTL_CHILDREN(node);
6969
6970	SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
6971		       CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version,
6972		       "driver version");
6973
6974	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
6975		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
6976			"# of txqs to use, 0 => use default #");
6977	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
6978		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
6979			"# of rxqs to use, 0 => use default #");
6980	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
6981		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
		       "permit #txq != #rxq");
	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
		       CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
		       "disable MSI-X (default 0)");
6986	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
6987		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0,
6988		       "set the RX budget");
6989	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
6990		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
6991		       "cause TX to abdicate instead of running to completion");
6992	ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED;
6993	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset",
6994		       CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0,
6995		       "offset to start using cores at");
6996	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
6997		       CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
6998		       "use separate cores for TX and RX");
6999	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
7000		      CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
7001		      "try to make use of logical cores for TX and RX");
7002
7003	/* XXX change for per-queue sizes */
7004	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
7005	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
7006	    IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A",
7007	    "list of # of TX descriptors to use, 0 = use default #");
7008	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
7009	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
7010	    IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A",
7011	    "list of # of RX descriptors to use, 0 = use default #");
7012#endif
7013}
7014
7015static void
7016iflib_add_device_sysctl_post(if_ctx_t ctx)
7017{
7018#ifndef __HAIKU__
7019	if_shared_ctx_t sctx = ctx->ifc_sctx;
7020	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
	device_t dev = iflib_get_dev(ctx);
7022	struct sysctl_oid_list *child;
7023	struct sysctl_ctx_list *ctx_list;
7024	iflib_fl_t fl;
7025	iflib_txq_t txq;
7026	iflib_rxq_t rxq;
7027	int i, j;
7028	char namebuf[NAME_BUFLEN];
7029	char *qfmt;
7030	struct sysctl_oid *queue_node, *fl_node, *node;
7031	struct sysctl_oid_list *queue_list, *fl_list;
7032	ctx_list = device_get_sysctl_ctx(dev);
7033
7034	node = ctx->ifc_sysctl_node;
7035	child = SYSCTL_CHILDREN(node);
7036
7037	if (scctx->isc_ntxqsets > 100)
7038		qfmt = "txq%03d";
7039	else if (scctx->isc_ntxqsets > 10)
7040		qfmt = "txq%02d";
7041	else
7042		qfmt = "txq%d";
7043	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
7044		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
7045		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
7046		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
7047		queue_list = SYSCTL_CHILDREN(queue_node);
7048		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
7049			       CTLFLAG_RD,
7050			       &txq->ift_task.gt_cpu, 0, "cpu this queue is bound to");
7051#if MEMORY_LOGGING
7052		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
7053				CTLFLAG_RD,
7054				&txq->ift_dequeued, "total mbufs freed");
7055		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
7056				CTLFLAG_RD,
7057				&txq->ift_enqueued, "total mbufs enqueued");
7058#endif
7059		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
7060				   CTLFLAG_RD,
7061				   &txq->ift_mbuf_defrag, "# of times m_defrag was called");
7062		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
7063				   CTLFLAG_RD,
7064				   &txq->ift_pullups, "# of times m_pullup was called");
7065		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
7066				   CTLFLAG_RD,
7067				   &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
7068		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
7069				   CTLFLAG_RD,
7070				   &txq->ift_no_desc_avail, "# of times no descriptors were available");
7071		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
7072				   CTLFLAG_RD,
7073				   &txq->ift_map_failed, "# of times DMA map failed");
7074		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
7075				   CTLFLAG_RD,
7076				   &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
7077		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
7078				   CTLFLAG_RD,
7079				   &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
7080		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
7081				   CTLFLAG_RD,
7082				   &txq->ift_pidx, 1, "Producer Index");
7083		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
7084				   CTLFLAG_RD,
7085				   &txq->ift_cidx, 1, "Consumer Index");
7086		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
7087				   CTLFLAG_RD,
7088				   &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
7089		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
7090				   CTLFLAG_RD,
7091				   &txq->ift_in_use, 1, "descriptors in use");
7092		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
7093				   CTLFLAG_RD,
				   &txq->ift_processed, "descriptors processed for clean");
7095		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
7096				   CTLFLAG_RD,
7097				   &txq->ift_cleaned, "total cleaned");
7098		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
7099		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
7100		    __DEVOLATILE(uint64_t *, &txq->ift_br->state), 0,
7101		    mp_ring_state_handler, "A", "soft ring state");
7102		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
7103				       CTLFLAG_RD, &txq->ift_br->enqueues,
7104				       "# of enqueues to the mp_ring for this queue");
7105		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
7106				       CTLFLAG_RD, &txq->ift_br->drops,
7107				       "# of drops in the mp_ring for this queue");
7108		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
7109				       CTLFLAG_RD, &txq->ift_br->starts,
7110				       "# of normal consumer starts in the mp_ring for this queue");
7111		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
7112				       CTLFLAG_RD, &txq->ift_br->stalls,
7113					       "# of consumer stalls in the mp_ring for this queue");
7114		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
7115			       CTLFLAG_RD, &txq->ift_br->restarts,
7116				       "# of consumer restarts in the mp_ring for this queue");
7117		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
7118				       CTLFLAG_RD, &txq->ift_br->abdications,
7119				       "# of consumer abdications in the mp_ring for this queue");
7120	}
7121
7122	if (scctx->isc_nrxqsets > 100)
7123		qfmt = "rxq%03d";
7124	else if (scctx->isc_nrxqsets > 10)
7125		qfmt = "rxq%02d";
7126	else
7127		qfmt = "rxq%d";
7128	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
7129		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
7130		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
7131		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
7132		queue_list = SYSCTL_CHILDREN(queue_node);
7133		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
7134			       CTLFLAG_RD,
7135			       &rxq->ifr_task.gt_cpu, 0, "cpu this queue is bound to");
7136		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
7137			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
7138				       CTLFLAG_RD,
7139				       &rxq->ifr_cq_cidx, 1, "Consumer Index");
7140		}
7141
7142		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
7143			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
7144			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
7145			    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "freelist Name");
7146			fl_list = SYSCTL_CHILDREN(fl_node);
7147			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
7148				       CTLFLAG_RD,
7149				       &fl->ifl_pidx, 1, "Producer Index");
7150			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
7151				       CTLFLAG_RD,
7152				       &fl->ifl_cidx, 1, "Consumer Index");
7153			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
7154				       CTLFLAG_RD,
7155				       &fl->ifl_credits, 1, "credits available");
7156			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "buf_size",
7157				       CTLFLAG_RD,
7158				       &fl->ifl_buf_size, 1, "buffer size");
7159#if MEMORY_LOGGING
7160			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
7161					CTLFLAG_RD,
7162					&fl->ifl_m_enqueued, "mbufs allocated");
7163			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
7164					CTLFLAG_RD,
7165					&fl->ifl_m_dequeued, "mbufs freed");
7166			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
7167					CTLFLAG_RD,
7168					&fl->ifl_cl_enqueued, "clusters allocated");
7169			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
7170					CTLFLAG_RD,
7171					&fl->ifl_cl_dequeued, "clusters freed");
7172#endif
7173		}
7174	}
7175#endif
7176}
7177
7178void
7179iflib_request_reset(if_ctx_t ctx)
7180{
7181
7182	STATE_LOCK(ctx);
7183	ctx->ifc_flags |= IFC_DO_RESET;
7184	STATE_UNLOCK(ctx);
7185}
7186
7187#ifndef __NO_STRICT_ALIGNMENT
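/*
 * On strict-alignment architectures, shift (or, for large frames, copy)
 * the Ethernet header so that the payload following the 14-byte header
 * ends up properly aligned.
 */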
7188static struct mbuf *
7189iflib_fixup_rx(struct mbuf *m)
7190{
7191	struct mbuf *n;
7192
7193	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
7194		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
7195		m->m_data += ETHER_HDR_LEN;
7196		n = m;
7197	} else {
7198		MGETHDR(n, M_NOWAIT, MT_DATA);
7199		if (n == NULL) {
7200			m_freem(m);
7201			return (NULL);
7202		}
7203		bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
7204		m->m_data += ETHER_HDR_LEN;
7205		m->m_len -= ETHER_HDR_LEN;
7206		n->m_len = ETHER_HDR_LEN;
7207		M_MOVE_PKTHDR(n, m);
7208		n->m_next = m;
7209	}
7210	return (n);
7211}
7212#endif
7213
7214#ifdef DEBUGNET
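/*
 * debugnet glue: report ring geometry, prepare the receive free lists for
 * polled operation, and provide minimal transmit and poll entry points.
 */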
7215static void
7216iflib_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
7217{
7218	if_ctx_t ctx;
7219
7220	ctx = if_getsoftc(ifp);
7221	CTX_LOCK(ctx);
7222	*nrxr = NRXQSETS(ctx);
7223	*ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size;
7224	*clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size;
7225	CTX_UNLOCK(ctx);
7226}
7227
7228static void
7229iflib_debugnet_event(if_t ifp, enum debugnet_ev event)
7230{
7231	if_ctx_t ctx;
7232	if_softc_ctx_t scctx;
7233	iflib_fl_t fl;
7234	iflib_rxq_t rxq;
7235	int i, j;
7236
7237	ctx = if_getsoftc(ifp);
7238	scctx = &ctx->ifc_softc_ctx;
7239
7240	switch (event) {
7241	case DEBUGNET_START:
7242#ifndef __HAIKU__
7243		for (i = 0; i < scctx->isc_nrxqsets; i++) {
7244			rxq = &ctx->ifc_rxqs[i];
7245			for (j = 0; j < rxq->ifr_nfl; j++) {
7246				fl = rxq->ifr_fl;
7247				fl->ifl_zone = m_getzone(fl->ifl_buf_size);
7248			}
7249		}
7250		iflib_no_tx_batch = 1;
7251		break;
7252#endif
7253	default:
7254		break;
7255	}
7256}
7257
7258static int
7259iflib_debugnet_transmit(if_t ifp, struct mbuf *m)
7260{
7261	if_ctx_t ctx;
7262	iflib_txq_t txq;
7263	int error;
7264
7265	ctx = if_getsoftc(ifp);
7266	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
7267	    IFF_DRV_RUNNING)
7268		return (EBUSY);
7269
7270	txq = &ctx->ifc_txqs[0];
7271	error = iflib_encap(txq, &m);
7272	if (error == 0)
7273		(void)iflib_txd_db_check(txq, true);
7274	return (error);
7275}
7276
7277static int
7278iflib_debugnet_poll(if_t ifp, int count)
7279{
7280	struct epoch_tracker et;
7281	if_ctx_t ctx;
7282	if_softc_ctx_t scctx;
7283	iflib_txq_t txq;
7284	int i;
7285
7286	ctx = if_getsoftc(ifp);
7287	scctx = &ctx->ifc_softc_ctx;
7288
7289	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
7290	    IFF_DRV_RUNNING)
7291		return (EBUSY);
7292
7293	txq = &ctx->ifc_txqs[0];
7294	(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
7295
7296	NET_EPOCH_ENTER(et);
7297	for (i = 0; i < scctx->isc_nrxqsets; i++)
7298		(void)iflib_rxeof(&ctx->ifc_rxqs[i], 16 /* XXX */);
7299	NET_EPOCH_EXIT(et);
7300	return (0);
7301}
7302#endif /* DEBUGNET */
7303