1/*-
2 * Copyright (c) 2014-2016, Matthew Macy <mmacy@nextbsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 *  1. Redistributions of source code must retain the above copyright notice,
9 *     this list of conditions and the following disclaimer.
10 *
11 *  2. Neither the name of Matthew Macy nor the names of its
12 *     contributors may be used to endorse or promote products derived from
13 *     this software without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 * POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/11/sys/net/iflib.c 332288 2018-04-08 16:54:07Z brooks $");
30
31#include "opt_inet.h"
32#include "opt_inet6.h"
33#include "opt_acpi.h"
34
35#include <sys/param.h>
36#include <sys/types.h>
37#include <sys/bus.h>
38#include <sys/eventhandler.h>
39#include <sys/sockio.h>
40#include <sys/kernel.h>
41#include <sys/lock.h>
42#include <sys/mutex.h>
43#include <sys/module.h>
44#include <sys/kobj.h>
45#include <sys/rman.h>
46#include <sys/sbuf.h>
47#include <sys/smp.h>
48#include <sys/socket.h>
49#include <sys/sysctl.h>
50#include <sys/syslog.h>
51#include <sys/taskqueue.h>
52#include <sys/limits.h>
53
54
55#include <net/if.h>
56#include <net/if_var.h>
57#include <net/if_types.h>
58#include <net/if_media.h>
59#include <net/bpf.h>
60#include <net/ethernet.h>
61#include <net/mp_ring.h>
62
63#include <netinet/in.h>
64#include <netinet/in_pcb.h>
65#include <netinet/tcp_lro.h>
66#include <netinet/in_systm.h>
67#include <netinet/if_ether.h>
68#include <netinet/ip.h>
69#include <netinet/ip6.h>
70#include <netinet/tcp.h>
71
72#include <machine/bus.h>
73#include <machine/in_cksum.h>
74
75#include <vm/vm.h>
76#include <vm/pmap.h>
77
78#include <dev/led/led.h>
79#include <dev/pci/pcireg.h>
80#include <dev/pci/pcivar.h>
81#include <dev/pci/pci_private.h>
82
83#include <net/iflib.h>
84
85#include "ifdi_if.h"
86
87#if defined(__i386__) || defined(__amd64__)
88#include <sys/memdesc.h>
89#include <machine/bus.h>
90#include <machine/md_var.h>
91#include <machine/specialreg.h>
92#include <x86/include/busdma_impl.h>
93#include <x86/iommu/busdma_dmar.h>
94#endif
95
96
97/*
98 * Enable accounting of every mbuf as it comes into and goes out of iflib's software descriptor references.
99 */
100#define MEMORY_LOGGING 0
101/*
102 * Enable mbuf vectors for compressing long mbuf chains
103 */
104
105/*
106 * NB:
107 * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
108 *   we prefetch needs to be determined by the time spent in m_free vis a vis
109 *   the cost of a prefetch. This will of course vary based on the workload:
110 *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
111 *        is quite expensive, thus suggesting very little prefetch.
112 *      - small packet forwarding which is just returning a single mbuf to
113 *        UMA will typically be very fast vis a vis the cost of a memory
114 *        access.
115 */
116
117
118/*
119 * File organization:
120 *  - private structures
121 *  - iflib private utility functions
122 *  - ifnet functions
123 *  - vlan registry and other exported functions
124 *  - iflib public core functions
125 *
126 *
127 */
128static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
129
130struct iflib_txq;
131typedef struct iflib_txq *iflib_txq_t;
132struct iflib_rxq;
133typedef struct iflib_rxq *iflib_rxq_t;
134struct iflib_fl;
135typedef struct iflib_fl *iflib_fl_t;
136
137typedef struct iflib_filter_info {
138	driver_filter_t *ifi_filter;
139	void *ifi_filter_arg;
140	struct grouptask *ifi_task;
141} *iflib_filter_info_t;
142
143struct iflib_ctx {
144	KOBJ_FIELDS;
145	/*
146	 * Pointer to hardware driver's softc
147	 */
148	void *ifc_softc;
149	device_t ifc_dev;
150	if_t ifc_ifp;
151
152	cpuset_t ifc_cpus;
153	if_shared_ctx_t ifc_sctx;
154	struct if_softc_ctx ifc_softc_ctx;
155
156	struct mtx ifc_mtx;
157
158	uint16_t ifc_nhwtxqs;
159	uint16_t ifc_nhwrxqs;
160
161	iflib_txq_t ifc_txqs;
162	iflib_rxq_t ifc_rxqs;
163	uint32_t ifc_if_flags;
164	uint32_t ifc_flags;
165	uint32_t ifc_max_fl_buf_size;
166	int ifc_in_detach;
167
168	int ifc_link_state;
169	int ifc_link_irq;
170	int ifc_pause_frames;
171	int ifc_watchdog_events;
172	struct cdev *ifc_led_dev;
173	struct resource *ifc_msix_mem;
174
175	struct if_irq ifc_legacy_irq;
176	struct grouptask ifc_admin_task;
177	struct grouptask ifc_vflr_task;
178	struct iflib_filter_info ifc_filter_info;
179	struct ifmedia	ifc_media;
180
181	struct sysctl_oid *ifc_sysctl_node;
182	uint16_t ifc_sysctl_ntxqs;
183	uint16_t ifc_sysctl_nrxqs;
184	uint16_t ifc_sysctl_qs_eq_override;
185
186	uint16_t ifc_sysctl_ntxds[8];
187	uint16_t ifc_sysctl_nrxds[8];
188	struct if_txrx ifc_txrx;
189#define isc_txd_encap  ifc_txrx.ift_txd_encap
190#define isc_txd_flush  ifc_txrx.ift_txd_flush
191#define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
192#define isc_rxd_available ifc_txrx.ift_rxd_available
193#define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
194#define isc_rxd_refill ifc_txrx.ift_rxd_refill
195#define isc_rxd_flush ifc_txrx.ift_rxd_flush
198#define isc_legacy_intr ifc_txrx.ift_legacy_intr
199	eventhandler_tag ifc_vlan_attach_event;
200	eventhandler_tag ifc_vlan_detach_event;
201	uint8_t ifc_mac[ETHER_ADDR_LEN];
202	char ifc_mtx_name[16];
203};
204
205
206void *
207iflib_get_softc(if_ctx_t ctx)
208{
209
210	return (ctx->ifc_softc);
211}
212
213device_t
214iflib_get_dev(if_ctx_t ctx)
215{
216
217	return (ctx->ifc_dev);
218}
219
220if_t
221iflib_get_ifp(if_ctx_t ctx)
222{
223
224	return (ctx->ifc_ifp);
225}
226
227struct ifmedia *
228iflib_get_media(if_ctx_t ctx)
229{
230
231	return (&ctx->ifc_media);
232}
233
234void
235iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
236{
237
238	bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN);
239}
240
241if_softc_ctx_t
242iflib_get_softc_ctx(if_ctx_t ctx)
243{
244
245	return (&ctx->ifc_softc_ctx);
246}
247
248if_shared_ctx_t
249iflib_get_sctx(if_ctx_t ctx)
250{
251
252	return (ctx->ifc_sctx);
253}
254
255#define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
256
257#define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
258#define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
259
260#define RX_SW_DESC_MAP_CREATED	(1 << 0)
261#define TX_SW_DESC_MAP_CREATED	(1 << 1)
262#define RX_SW_DESC_INUSE        (1 << 3)
263#define TX_SW_DESC_MAPPED       (1 << 4)
264
265typedef struct iflib_sw_rx_desc {
266	bus_dmamap_t    ifsd_map;         /* bus_dma map for packet */
267	struct mbuf    *ifsd_m;           /* rx: uninitialized mbuf */
268	caddr_t         ifsd_cl;          /* direct cluster pointer for rx */
269	uint16_t	ifsd_flags;
270} *iflib_rxsd_t;
271
272typedef struct iflib_sw_tx_desc_val {
273	bus_dmamap_t    ifsd_map;         /* bus_dma map for packet */
274	struct mbuf    *ifsd_m;           /* pkthdr mbuf */
275	uint8_t		ifsd_flags;
276} *iflib_txsd_val_t;
277
278typedef struct iflib_sw_tx_desc_array {
279	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
280	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
281	uint8_t		*ifsd_flags;
282} iflib_txsd_array_t;
283
284
285/* magic number that should be high enough for any hardware */
286#define IFLIB_MAX_TX_SEGS		128
287#define IFLIB_MAX_RX_SEGS		32
288#define IFLIB_RX_COPY_THRESH		128
289#define IFLIB_MAX_RX_REFRESH		32
290#define IFLIB_QUEUE_IDLE		0
291#define IFLIB_QUEUE_HUNG		1
292#define IFLIB_QUEUE_WORKING		2
293
294/* this should really scale with ring size - 16 is a fairly arbitrary value for this */
295#define TX_BATCH_SIZE			16
296
297#define IFLIB_RESTART_BUDGET		8
298
299#define	IFC_LEGACY		0x01
300#define	IFC_QFLUSH		0x02
301#define	IFC_MULTISEG		0x04
302#define	IFC_DMAR		0x08
303#define	IFC_SC_ALLOCATED	0x10
304
305#define CSUM_OFFLOAD		(CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
306				 CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
307				 CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
308struct iflib_txq {
309	uint16_t	ift_in_use;
310	uint16_t	ift_cidx;
311	uint16_t	ift_cidx_processed;
312	uint16_t	ift_pidx;
313	uint8_t		ift_gen;
314	uint8_t		ift_db_pending;
315	uint8_t		ift_db_pending_queued;
316	uint8_t		ift_npending;
317	uint8_t		ift_br_offset;
318	/* implicit pad */
319	uint64_t	ift_processed;
320	uint64_t	ift_cleaned;
321#if MEMORY_LOGGING
322	uint64_t	ift_enqueued;
323	uint64_t	ift_dequeued;
324#endif
325	uint64_t	ift_no_tx_dma_setup;
326	uint64_t	ift_no_desc_avail;
327	uint64_t	ift_mbuf_defrag_failed;
328	uint64_t	ift_mbuf_defrag;
329	uint64_t	ift_map_failed;
330	uint64_t	ift_txd_encap_efbig;
331	uint64_t	ift_pullups;
332
333	struct mtx	ift_mtx;
334	struct mtx	ift_db_mtx;
335
336	/* constant values */
337	if_ctx_t	ift_ctx;
338	struct ifmp_ring        **ift_br;
339	struct grouptask	ift_task;
340	uint16_t	ift_size;
341	uint16_t	ift_id;
342	struct callout	ift_timer;
343	struct callout	ift_db_check;
344
345	iflib_txsd_array_t	ift_sds;
346	uint8_t			ift_nbr;
347	uint8_t			ift_qstatus;
348	uint8_t			ift_active;
349	uint8_t			ift_closed;
350	int			ift_watchdog_time;
351	struct iflib_filter_info ift_filter_info;
352	bus_dma_tag_t		ift_desc_tag;
353	bus_dma_tag_t		ift_tso_desc_tag;
354	iflib_dma_info_t	ift_ifdi;
355#define MTX_NAME_LEN 16
356	char                    ift_mtx_name[MTX_NAME_LEN];
357	char                    ift_db_mtx_name[MTX_NAME_LEN];
358	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
359} __aligned(CACHE_LINE_SIZE);
360
361struct iflib_fl {
362	uint16_t	ifl_cidx;
363	uint16_t	ifl_pidx;
364	uint16_t	ifl_credits;
365	uint8_t		ifl_gen;
366#if MEMORY_LOGGING
367	uint64_t	ifl_m_enqueued;
368	uint64_t	ifl_m_dequeued;
369	uint64_t	ifl_cl_enqueued;
370	uint64_t	ifl_cl_dequeued;
371#endif
372	/* implicit pad */
373
374	/* constant */
375	uint16_t	ifl_size;
376	uint16_t	ifl_buf_size;
377	uint16_t	ifl_cltype;
378	uma_zone_t	ifl_zone;
379	iflib_rxsd_t	ifl_sds;
380	iflib_rxq_t	ifl_rxq;
381	uint8_t		ifl_id;
382	bus_dma_tag_t           ifl_desc_tag;
383	iflib_dma_info_t	ifl_ifdi;
384	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
385	caddr_t		ifl_vm_addrs[IFLIB_MAX_RX_REFRESH];
386}  __aligned(CACHE_LINE_SIZE);
387
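/*
 * Compute the number of ring entries currently in use, given the consumer
 * index (cidx), the producer index (pidx) and a generation bit that
 * disambiguates the completely-empty and completely-full cases when
 * pidx == cidx.
 */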
388static inline int
389get_inuse(int size, int cidx, int pidx, int gen)
390{
391	int used;
392
393	if (pidx > cidx)
394		used = pidx - cidx;
395	else if (pidx < cidx)
396		used = size - cidx + pidx;
397	else if (gen == 0 && pidx == cidx)
398		used = 0;
399	else if (gen == 1 && pidx == cidx)
400		used = size;
401	else
402		panic("bad state");
403
404	return (used);
405}
406
407#define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
408
409#define IDXDIFF(head, tail, wrap) \
410	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
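/*
 * Worked example (hypothetical values): on a 1024-entry ring with
 * tail == 1020 and head == 10, IDXDIFF(10, 1020, 1024) == 14, i.e. the
 * head has wrapped and sits 14 entries ahead of the tail.
 */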
411
412struct iflib_rxq {
413	/* If there is a separate completion queue -
414	 * these are the cq cidx and pidx. Otherwise
415	 * these are unused.
416	 */
417	uint16_t	ifr_size;
418	uint16_t	ifr_cq_cidx;
419	uint16_t	ifr_cq_pidx;
420	uint8_t		ifr_cq_gen;
421	uint8_t		ifr_fl_offset;
422
423	if_ctx_t	ifr_ctx;
424	iflib_fl_t	ifr_fl;
425	uint64_t	ifr_rx_irq;
426	uint16_t	ifr_id;
427	uint8_t		ifr_lro_enabled;
428	uint8_t		ifr_nfl;
429	struct lro_ctrl			ifr_lc;
430	struct grouptask        ifr_task;
431	struct iflib_filter_info ifr_filter_info;
432	iflib_dma_info_t		ifr_ifdi;
433	/* dynamically allocate if any drivers need a value substantially larger than this */
434	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
435}  __aligned(CACHE_LINE_SIZE);
436
437/*
438 * Only allow a single packet to take up at most 1/nth of the tx ring
439 */
440#define MAX_SINGLE_PACKET_FRACTION 12
441#define IF_BAD_DMA (bus_addr_t)-1
442
443static int enable_msix = 1;
444
445#define mtx_held(m)	(((m)->mtx_lock & ~MTX_FLAGMASK) != (uintptr_t)0)
446
447
448
449#define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
450
451#define CTX_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_mtx, _name, "iflib ctx lock", MTX_DEF)
452
453#define CTX_LOCK(ctx) mtx_lock(&(ctx)->ifc_mtx)
454#define CTX_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_mtx)
455#define CTX_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_mtx)
456
457
458#define TXDB_LOCK_INIT(txq)  mtx_init(&(txq)->ift_db_mtx, (txq)->ift_db_mtx_name, NULL, MTX_DEF)
459#define TXDB_TRYLOCK(txq) mtx_trylock(&(txq)->ift_db_mtx)
460#define TXDB_LOCK(txq) mtx_lock(&(txq)->ift_db_mtx)
461#define TXDB_UNLOCK(txq) mtx_unlock(&(txq)->ift_db_mtx)
462#define TXDB_LOCK_DESTROY(txq) mtx_destroy(&(txq)->ift_db_mtx)
463
464#define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
465#define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
466
467
468/* Our boot-time initialization hook */
469static int	iflib_module_event_handler(module_t, int, void *);
470
471static moduledata_t iflib_moduledata = {
472	"iflib",
473	iflib_module_event_handler,
474	NULL
475};
476
477DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
478MODULE_VERSION(iflib, 1);
479
480MODULE_DEPEND(iflib, pci, 1, 1, 1);
481MODULE_DEPEND(iflib, ether, 1, 1, 1);
482
483TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
484
485#ifndef IFLIB_DEBUG_COUNTERS
486#ifdef INVARIANTS
487#define IFLIB_DEBUG_COUNTERS 1
488#else
489#define IFLIB_DEBUG_COUNTERS 0
490#endif /* !INVARIANTS */
491#endif
492
493static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0,
494                   "iflib driver parameters");
495
496/*
497 * XXX need to ensure that this can't accidentally cause the head to be moved backwards
498 */
499static int iflib_min_tx_latency = 0;
500
501SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
502		   &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
503
504
505#if IFLIB_DEBUG_COUNTERS
506
507static int iflib_tx_seen;
508static int iflib_tx_sent;
509static int iflib_tx_encap;
510static int iflib_rx_allocs;
511static int iflib_fl_refills;
512static int iflib_fl_refills_large;
513static int iflib_tx_frees;
514
515SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
516		   &iflib_tx_seen, 0, "# tx mbufs seen");
517SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
518		   &iflib_tx_sent, 0, "# tx mbufs sent");
519SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
520		   &iflib_tx_encap, 0, "# tx mbufs encapped");
521SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
522		   &iflib_tx_frees, 0, "# tx frees");
523SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
524		   &iflib_rx_allocs, 0, "# rx allocations");
525SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
526		   &iflib_fl_refills, 0, "# refills");
527SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
528		   &iflib_fl_refills_large, 0, "# large refills");
529
530
531static int iflib_txq_drain_flushing;
532static int iflib_txq_drain_oactive;
533static int iflib_txq_drain_notready;
534static int iflib_txq_drain_encapfail;
535
536SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
537		   &iflib_txq_drain_flushing, 0, "# drain flushes");
538SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
539		   &iflib_txq_drain_oactive, 0, "# drain oactives");
540SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
541		   &iflib_txq_drain_notready, 0, "# drain notready");
542SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_encapfail, CTLFLAG_RD,
543		   &iflib_txq_drain_encapfail, 0, "# drain encap fails");
544
545
546static int iflib_encap_load_mbuf_fail;
547static int iflib_encap_txq_avail_fail;
548static int iflib_encap_txd_encap_fail;
549
550SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
551		   &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
552SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
553		   &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
554SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
555		   &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
556
557static int iflib_task_fn_rxs;
558static int iflib_rx_intr_enables;
559static int iflib_fast_intrs;
560static int iflib_intr_link;
561static int iflib_intr_msix;
562static int iflib_rx_unavail;
563static int iflib_rx_ctx_inactive;
564static int iflib_rx_zero_len;
565static int iflib_rx_if_input;
566static int iflib_rx_mbuf_null;
567static int iflib_rxd_flush;
568
569static int iflib_verbose_debug;
570
571SYSCTL_INT(_net_iflib, OID_AUTO, intr_link, CTLFLAG_RD,
572		   &iflib_intr_link, 0, "# intr link calls");
573SYSCTL_INT(_net_iflib, OID_AUTO, intr_msix, CTLFLAG_RD,
574		   &iflib_intr_msix, 0, "# intr msix calls");
575SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
576		   &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
577SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
578		   &iflib_rx_intr_enables, 0, "# rx intr enables");
579SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
580		   &iflib_fast_intrs, 0, "# fast_intr calls");
581SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
582		   &iflib_rx_unavail, 0, "# times rxeof called with no available data");
583SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
584		   &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
585SYSCTL_INT(_net_iflib, OID_AUTO, rx_zero_len, CTLFLAG_RD,
586		   &iflib_rx_zero_len, 0, "# times rxeof saw zero len mbuf");
587SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
588		   &iflib_rx_if_input, 0, "# times rxeof called if_input");
589SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD,
590		   &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf");
591SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
592	         &iflib_rxd_flush, 0, "# times rxd_flush called");
593SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
594		   &iflib_verbose_debug, 0, "enable verbose debugging");
595
596#define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
597
598#else
599#define DBG_COUNTER_INC(name)
600
601#endif
602
603
604
605#define IFLIB_DEBUG 0
606
607static void iflib_tx_structures_free(if_ctx_t ctx);
608static void iflib_rx_structures_free(if_ctx_t ctx);
609static int iflib_queues_alloc(if_ctx_t ctx);
610static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
611static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget);
612static int iflib_qset_structures_setup(if_ctx_t ctx);
613static int iflib_msix_init(if_ctx_t ctx);
614static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, char *str);
615static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
616static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
617static int iflib_register(if_ctx_t);
618static void iflib_init_locked(if_ctx_t ctx);
619static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
620static void iflib_add_device_sysctl_post(if_ctx_t ctx);
621
622
623#ifdef DEV_NETMAP
624#include <sys/selinfo.h>
625#include <net/netmap.h>
626#include <dev/netmap/netmap_kern.h>
627
628MODULE_DEPEND(iflib, netmap, 1, 1, 1);
629
630/*
631 * device-specific sysctl variables:
632 *
633 * iflib_crcstrip: 1: strip CRC on rx frames (default), 0: keep it.
634 *	During regular operations the CRC is stripped, but on some
635 *	hardware reception of frames not multiple of 64 is slower,
636 *	so using crcstrip=0 helps in benchmarks.
637 *
638 * iflib_rx_miss, iflib_rx_miss_bufs:
639 *	count packets that might be missed due to lost interrupts.
640 */
641SYSCTL_DECL(_dev_netmap);
642/*
643 * The xl driver by default strips CRCs and we do not override it.
644 */
645
646int iflib_crcstrip = 1;
647SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
648    CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on rx frames");
649
650int iflib_rx_miss, iflib_rx_miss_bufs;
651SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
652    CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed rx intr");
653SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
654    CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed rx intr bufs");
655
656/*
657 * Register/unregister. We are already under netmap lock.
658 * Only called on the first register or the last unregister.
659 */
660static int
661iflib_netmap_register(struct netmap_adapter *na, int onoff)
662{
663	struct ifnet *ifp = na->ifp;
664	if_ctx_t ctx = ifp->if_softc;
665
666	CTX_LOCK(ctx);
667	IFDI_INTR_DISABLE(ctx);
668
669	/* Tell the stack that the interface is no longer active */
670	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
671
672	if (!CTX_IS_VF(ctx))
673		IFDI_CRCSTRIP_SET(ctx, onoff);
674
675	/* enable or disable flags and callbacks in na and ifp */
676	if (onoff) {
677		nm_set_native_flags(na);
678	} else {
679		nm_clear_native_flags(na);
680	}
681	IFDI_INIT(ctx);
682	IFDI_CRCSTRIP_SET(ctx, onoff); // XXX why twice ?
683	CTX_UNLOCK(ctx);
684	return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1);
685}
686
687/*
688 * Reconcile kernel and user view of the transmit ring.
689 *
690 * All information is in the kring.
691 * Userspace wants to send packets up to the one before kring->rhead,
692 * kernel knows kring->nr_hwcur is the first unsent packet.
693 *
694 * Here we push packets out (as many as possible), and possibly
695 * reclaim buffers from previously completed transmission.
696 *
697 * The caller (netmap) guarantees that there is only one instance
698 * running at any time. Any interference with other driver
699 * methods should be handled by the individual drivers.
700 */
701static int
702iflib_netmap_txsync(struct netmap_kring *kring, int flags)
703{
704	struct netmap_adapter *na = kring->na;
705	struct ifnet *ifp = na->ifp;
706	struct netmap_ring *ring = kring->ring;
707	u_int nm_i;	/* index into the netmap ring */
708	u_int nic_i;	/* index into the NIC ring */
709	u_int n;
710	u_int const lim = kring->nkr_num_slots - 1;
711	u_int const head = kring->rhead;
712	struct if_pkt_info pi;
713
714	/*
715	 * interrupts on every tx packet are expensive so request
716	 * them every half ring, or where NS_REPORT is set
717	 */
718	u_int report_frequency = kring->nkr_num_slots >> 1;
719	/* device-specific */
720	if_ctx_t ctx = ifp->if_softc;
721	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
722
723	pi.ipi_segs = txq->ift_segs;
724	pi.ipi_qsidx = kring->ring_id;
725	pi.ipi_ndescs = 0;
726
727	bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
728					BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
729
730
731	/*
732	 * First part: process new packets to send.
733	 * nm_i is the current index in the netmap ring,
734	 * nic_i is the corresponding index in the NIC ring.
735	 *
736	 * If we have packets to send (nm_i != head)
737	 * iterate over the netmap ring, fetch length and update
738	 * the corresponding slot in the NIC ring. Some drivers also
739	 * need to update the buffer's physical address in the NIC slot
740	 * even if NS_BUF_CHANGED is not set (PNMB computes the addresses).
741	 *
742	 * The netmap_reload_map() call is especially expensive,
743	 * even when (as in this case) the tag is 0, so do it only
744	 * when the buffer has actually changed.
745	 *
746	 * If possible do not set the report/intr bit on all slots,
747	 * but only a few times per ring or when NS_REPORT is set.
748	 *
749	 * Finally, on 10G and faster drivers, it might be useful
750	 * to prefetch the next slot and txr entry.
751	 */
752
753	nm_i = kring->nr_hwcur;
754	if (nm_i != head) {	/* we have new packets to send */
755		nic_i = netmap_idx_k2n(kring, nm_i);
756
757		__builtin_prefetch(&ring->slot[nm_i]);
758		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
759		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
760
761		for (n = 0; nm_i != head; n++) {
762			struct netmap_slot *slot = &ring->slot[nm_i];
763			u_int len = slot->len;
764			uint64_t paddr;
765			void *addr = PNMB(na, slot, &paddr);
766			int flags = (slot->flags & NS_REPORT ||
767				nic_i == 0 || nic_i == report_frequency) ?
768				IPI_TX_INTR : 0;
769
770			/* device-specific */
771			pi.ipi_pidx = nic_i;
772			pi.ipi_flags = flags;
773
774			/* Fill the slot in the NIC ring. */
775			ctx->isc_txd_encap(ctx->ifc_softc, &pi);
776
777			/* prefetch for next round */
778			__builtin_prefetch(&ring->slot[nm_i + 1]);
779			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
780			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
781
782			NM_CHECK_ADDR_LEN(na, addr, len);
783
784			if (slot->flags & NS_BUF_CHANGED) {
785				/* buffer has changed, reload map */
786				netmap_reload_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[nic_i], addr);
787			}
788			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
789
790			/* make sure changes to the buffer are synced */
791			bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_sds.ifsd_map[nic_i],
792							BUS_DMASYNC_PREWRITE);
793
794			nm_i = nm_next(nm_i, lim);
795			nic_i = nm_next(nic_i, lim);
796		}
797		kring->nr_hwcur = head;
798
799		/* synchronize the NIC ring */
800		bus_dmamap_sync(txq->ift_desc_tag, txq->ift_ifdi->idi_map,
801						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
802
803		/* (re)start the tx unit up to slot nic_i (excluded) */
804		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
805	}
806
807	/*
808	 * Second part: reclaim buffers for completed transmissions.
809	 */
810	if (iflib_tx_credits_update(ctx, txq)) {
811		/* some tx completed, increment avail */
812		nic_i = txq->ift_cidx_processed;
813		kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
814	}
815	return (0);
816}
817
818/*
819 * Reconcile kernel and user view of the receive ring.
820 * Same as for the txsync, this routine must be efficient.
821 * The caller guarantees a single invocation, but races against
822 * the rest of the driver should be handled here.
823 *
824 * On call, kring->rhead is the first packet that userspace wants
825 * to keep, and kring->rcur is the wakeup point.
826 * The kernel has previously reported packets up to kring->rtail.
827 *
828 * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
829 * of whether or not we received an interrupt.
830 */
831static int
832iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
833{
834	struct netmap_adapter *na = kring->na;
835	struct ifnet *ifp = na->ifp;
836	struct netmap_ring *ring = kring->ring;
837	u_int nm_i;	/* index into the netmap ring */
838	u_int nic_i;	/* index into the NIC ring */
839	u_int i, n;
840	u_int const lim = kring->nkr_num_slots - 1;
841	u_int const head = kring->rhead;
842	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
843	struct if_rxd_info ri;
844	/* device-specific */
845	if_ctx_t ctx = ifp->if_softc;
846	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
847	iflib_fl_t fl = rxq->ifr_fl;
848	if (head > lim)
849		return netmap_ring_reinit(kring);
850
851	bzero(&ri, sizeof(ri));
852	ri.iri_qsidx = kring->ring_id;
853	ri.iri_ifp = ctx->ifc_ifp;
854	/* XXX check sync modes */
855	for (i = 0, fl = rxq->ifr_fl; i < rxq->ifr_nfl; i++, fl++)
856		bus_dmamap_sync(rxq->ifr_fl[i].ifl_desc_tag, fl->ifl_ifdi->idi_map,
857				BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
858
859	/*
860	 * First part: import newly received packets.
861	 *
862	 * nm_i is the index of the next free slot in the netmap ring,
863	 * nic_i is the index of the next received packet in the NIC ring,
864	 * and they may differ in case if_init() has been called while
865	 * in netmap mode. For the receive ring we have
866	 *
867	 *	nic_i = fl->ifl_cidx;
868	 *	nm_i = kring->nr_hwtail (previous)
869	 * and
870	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
871	 *
872	 * fl->ifl_cidx is set to 0 on a ring reinit
873	 */
874	if (netmap_no_pendintr || force_update) {
875		int crclen = iflib_crcstrip ? 0 : 4;
876		int error, avail;
877		uint16_t slot_flags = kring->nkr_slot_flags;
878
879		for (fl = rxq->ifr_fl, i = 0; i < rxq->ifr_nfl; i++, fl++) {
880			nic_i = fl->ifl_cidx;
881			nm_i = netmap_idx_n2k(kring, nic_i);
882			avail = ctx->isc_rxd_available(ctx->ifc_softc, kring->ring_id, nic_i, INT_MAX);
883			for (n = 0; avail > 0; n++, avail--) {
884				error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
885				if (error)
886					ring->slot[nm_i].len = 0;
887				else
888					ring->slot[nm_i].len = ri.iri_len - crclen;
889				ring->slot[nm_i].flags = slot_flags;
890				bus_dmamap_sync(fl->ifl_ifdi->idi_tag,
891								fl->ifl_sds[nic_i].ifsd_map, BUS_DMASYNC_POSTREAD);
892				nm_i = nm_next(nm_i, lim);
893				nic_i = nm_next(nic_i, lim);
894			}
895			if (n) { /* update the state variables */
896				if (netmap_no_pendintr && !force_update) {
897					/* diagnostics */
898					iflib_rx_miss ++;
899					iflib_rx_miss_bufs += n;
900				}
901				fl->ifl_cidx = nic_i;
902				kring->nr_hwtail = nm_i;
903			}
904			kring->nr_kflags &= ~NKR_PENDINTR;
905		}
906	}
907	/*
908	 * Second part: skip past packets that userspace has released.
909	 * (kring->nr_hwcur to head excluded),
910	 * and make the buffers available for reception.
911	 * As usual nm_i is the index in the netmap ring,
912	 * nic_i is the index in the NIC ring, and
913	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
914	 */
915	/* XXX not sure how this will work with multiple free lists */
916	nm_i = kring->nr_hwcur;
917	if (nm_i != head) {
918		nic_i = netmap_idx_k2n(kring, nm_i);
919		for (n = 0; nm_i != head; n++) {
920			struct netmap_slot *slot = &ring->slot[nm_i];
921			uint64_t paddr;
922			caddr_t vaddr;
923			void *addr = PNMB(na, slot, &paddr);
924
925			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
926				goto ring_reset;
927
928			vaddr = addr;
929			if (slot->flags & NS_BUF_CHANGED) {
930				/* buffer has changed, reload map */
931				netmap_reload_map(na, fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map, addr);
932				slot->flags &= ~NS_BUF_CHANGED;
933			}
934			/*
935			 * XXX we should be batching this operation - TODO
936			 */
937			ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i, &paddr, &vaddr, 1, fl->ifl_buf_size);
938			bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_sds[nic_i].ifsd_map,
939			    BUS_DMASYNC_PREREAD);
940			nm_i = nm_next(nm_i, lim);
941			nic_i = nm_next(nic_i, lim);
942		}
943		kring->nr_hwcur = head;
944
945		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
946		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
947		/*
948		 * IMPORTANT: we must leave one free slot in the ring,
949		 * so move nic_i back by one unit
950		 */
951		nic_i = nm_prev(nic_i, lim);
952		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i);
953	}
954
955	return 0;
956
957ring_reset:
958	return netmap_ring_reinit(kring);
959}
960
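/*
 * Describe the iflib context to netmap: fill in a netmap_adapter with the
 * ring and descriptor counts plus the txsync/rxsync/register callbacks
 * above, then hand it to netmap_attach().
 */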
961static int
962iflib_netmap_attach(if_ctx_t ctx)
963{
964	struct netmap_adapter na;
965	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
966
967	bzero(&na, sizeof(na));
968
969	na.ifp = ctx->ifc_ifp;
970	na.na_flags = NAF_BDG_MAYSLEEP;
971	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
972	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
973
974	na.num_tx_desc = scctx->isc_ntxd[0];
975	na.num_rx_desc = scctx->isc_nrxd[0];
976	na.nm_txsync = iflib_netmap_txsync;
977	na.nm_rxsync = iflib_netmap_rxsync;
978	na.nm_register = iflib_netmap_register;
979	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
980	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
981	return (netmap_attach(&na));
982}
983
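/*
 * Point the netmap tx ring's DMA maps at the netmap-provided buffers for
 * this queue.
 */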
984static void
985iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
986{
987	struct netmap_adapter *na = NA(ctx->ifc_ifp);
988	struct netmap_slot *slot;
989
990	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
991	if (slot == NULL)
992		return;
993
994	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
995
996		/*
997		 * In netmap mode, set the map for the packet buffer.
998		 * NOTE: Some drivers (not this one) also need to set
999		 * the physical buffer address in the NIC ring.
1000		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
1001		 * netmap slot index, si
1002		 */
1003		int si = netmap_idx_n2k(&na->tx_rings[txq->ift_id], i);
1004		netmap_load_map(na, txq->ift_desc_tag, txq->ift_sds.ifsd_map[i], NMB(na, slot + si));
1005	}
1006}
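
/*
 * Populate the netmap rx ring: load each slot's buffer into its DMA map,
 * hand the bus/virtual addresses to the driver's refill routine, and flush
 * the descriptors that may be posted to the hardware.
 */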
1007static void
1008iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
1009{
1010	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1011	struct netmap_slot *slot;
1012	iflib_rxsd_t sd;
1013	int nrxd;
1014
1015	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
1016	if (slot == NULL)
1017		return;
1018	sd = rxq->ifr_fl[0].ifl_sds;
1019	nrxd = ctx->ifc_softc_ctx.isc_nrxd[0];
1020	for (int i = 0; i < nrxd; i++, sd++) {
1021			int sj = netmap_idx_n2k(&na->rx_rings[rxq->ifr_id], i);
1022			uint64_t paddr;
1023			void *addr;
1024			caddr_t vaddr;
1025
1026			vaddr = addr = PNMB(na, slot + sj, &paddr);
1027			netmap_load_map(na, rxq->ifr_fl[0].ifl_ifdi->idi_tag, sd->ifsd_map, addr);
1028			/* Update descriptor and the cached value */
1029			ctx->isc_rxd_refill(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, i, &paddr, &vaddr, 1, rxq->ifr_fl[0].ifl_buf_size);
1030	}
1031	/* preserve queue */
1032	if (ctx->ifc_ifp->if_capenable & IFCAP_NETMAP) {
1033		struct netmap_kring *kring = &na->rx_rings[rxq->ifr_id];
1034		int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring);
1035		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, t);
1036	} else
1037		ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, 0 /* fl_id */, nrxd-1);
1038}
1039
1040#define iflib_netmap_detach(ifp) netmap_detach(ifp)
1041
1042#else
1043#define iflib_netmap_txq_init(ctx, txq)
1044#define iflib_netmap_rxq_init(ctx, rxq)
1045#define iflib_netmap_detach(ifp)
1046
1047#define iflib_netmap_attach(ctx) (0)
1048#define netmap_rx_irq(ifp, qid, budget) (0)
1049
1050#endif
1051
1052#if defined(__i386__) || defined(__amd64__)
1053static __inline void
1054prefetch(void *x)
1055{
1056	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1057}
1058#else
1059#define prefetch(x)
1060#endif
1061
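/*
 * bus_dmamap_load() callback used by iflib_dma_alloc(): record the bus
 * address of the single segment that was mapped.
 */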
1062static void
1063_iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1064{
1065	if (err)
1066		return;
1067	*(bus_addr_t *) arg = segs[0].ds_addr;
1068}
1069
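/*
 * Allocate a single physically contiguous, DMA-able region of @size bytes
 * and describe it in @dma: create a tag, allocate zeroed coherent memory,
 * and load the map to obtain the bus address.  Returns 0 on success or a
 * bus_dma error; on failure anything allocated so far is torn down.
 *
 * Typical use (sketch; names are illustrative, not from this file):
 *
 *	struct iflib_dma_info ring;
 *
 *	if (iflib_dma_alloc(ctx, ring_size, &ring, BUS_DMA_NOWAIT))
 *		return (ENOMEM);
 *	...
 *	iflib_dma_free(&ring);
 */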
1070int
1071iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
1072{
1073	int err;
1074	if_shared_ctx_t sctx = ctx->ifc_sctx;
1075	device_t dev = ctx->ifc_dev;
1076
1077	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
1078
1079	err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
1080				sctx->isc_q_align, 0,	/* alignment, bounds */
1081				BUS_SPACE_MAXADDR,	/* lowaddr */
1082				BUS_SPACE_MAXADDR,	/* highaddr */
1083				NULL, NULL,		/* filter, filterarg */
1084				size,			/* maxsize */
1085				1,			/* nsegments */
1086				size,			/* maxsegsize */
1087				BUS_DMA_ALLOCNOW,	/* flags */
1088				NULL,			/* lockfunc */
1089				NULL,			/* lockarg */
1090				&dma->idi_tag);
1091	if (err) {
1092		device_printf(dev,
1093		    "%s: bus_dma_tag_create failed: %d\n",
1094		    __func__, err);
1095		goto fail_0;
1096	}
1097
1098	err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
1099	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
1100	if (err) {
1101		device_printf(dev,
1102		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
1103		    __func__, (uintmax_t)size, err);
1104		goto fail_1;
1105	}
1106
1107	dma->idi_paddr = IF_BAD_DMA;
1108	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
1109	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
1110	if (err || dma->idi_paddr == IF_BAD_DMA) {
1111		device_printf(dev,
1112		    "%s: bus_dmamap_load failed: %d\n",
1113		    __func__, err);
1114		goto fail_2;
1115	}
1116
1117	dma->idi_size = size;
1118	return (0);
1119
1120fail_2:
1121	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1122fail_1:
1123	bus_dma_tag_destroy(dma->idi_tag);
1124fail_0:
1125	dma->idi_tag = NULL;
1126
1127	return (err);
1128}
1129
1130int
1131iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
1132{
1133	int i, err = 0;
1134	iflib_dma_info_t *dmaiter;
1135
1136	dmaiter = dmalist;
1137	for (i = 0; i < count; i++, dmaiter++) {
1138		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
1139			break;
1140	}
1141	if (err)
1142		iflib_dma_free_multi(dmalist, i);
1143	return (err);
1144}
1145
1146void
1147iflib_dma_free(iflib_dma_info_t dma)
1148{
1149	if (dma->idi_tag == NULL)
1150		return;
1151	if (dma->idi_paddr != IF_BAD_DMA) {
1152		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
1153		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1154		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
1155		dma->idi_paddr = IF_BAD_DMA;
1156	}
1157	if (dma->idi_vaddr != NULL) {
1158		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1159		dma->idi_vaddr = NULL;
1160	}
1161	bus_dma_tag_destroy(dma->idi_tag);
1162	dma->idi_tag = NULL;
1163}
1164
1165void
1166iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
1167{
1168	int i;
1169	iflib_dma_info_t *dmaiter = dmalist;
1170
1171	for (i = 0; i < count; i++, dmaiter++)
1172		iflib_dma_free(*dmaiter);
1173}
1174
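/*
 * Interrupt filter shared by iflib MSI-X vectors: run the driver's own
 * filter first (if one was supplied) and, unless it reports the event as
 * fully handled, enqueue the deferred grouptask.
 */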
1175static int
1176iflib_fast_intr(void *arg)
1177{
1178	iflib_filter_info_t info = arg;
1179	struct grouptask *gtask = info->ifi_task;
1180
1181	DBG_COUNTER_INC(fast_intrs);
1182	if (info->ifi_filter != NULL && info->ifi_filter(info->ifi_filter_arg) == FILTER_HANDLED)
1183		return (FILTER_HANDLED);
1184
1185	GROUPTASK_ENQUEUE(gtask);
1186	return (FILTER_HANDLED);
1187}
1188
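/*
 * Allocate and set up a bus interrupt for @rid: either a filter or a
 * handler may be given, but not both.  The allocated resource and tag are
 * recorded in @irq for later teardown.
 */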
1189static int
1190_iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
1191	driver_filter_t filter, driver_intr_t handler, void *arg,
1192				 char *name)
1193{
1194	int rc;
1195	struct resource *res;
1196	void *tag;
1197	device_t dev = ctx->ifc_dev;
1198
1199	MPASS(rid < 512);
1200	irq->ii_rid = rid;
1201	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid,
1202				     RF_SHAREABLE | RF_ACTIVE);
1203	if (res == NULL) {
1204		device_printf(dev,
1205		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
1206		return (ENOMEM);
1207	}
1208	irq->ii_res = res;
1209	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
1210	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
1211						filter, handler, arg, &tag);
1212	if (rc != 0) {
1213		device_printf(dev,
1214		    "failed to setup interrupt for rid %d, name %s: %d\n",
1215					  rid, name ? name : "unknown", rc);
1216		return (rc);
1217	} else if (name)
1218		bus_describe_intr(dev, res, tag, "%s", name);
1219
1220	irq->ii_tag = tag;
1221	return (0);
1222}
1223
1224
1225/*********************************************************************
1226 *
1227 *  Allocate memory for tx_buffer structures. The tx_buffer stores all
1228 *  the information needed to transmit a packet on the wire. This is
1229 *  called only once at attach, setup is done every reset.
1230 *
1231 **********************************************************************/
1232
1233static int
1234iflib_txsd_alloc(iflib_txq_t txq)
1235{
1236	if_ctx_t ctx = txq->ift_ctx;
1237	if_shared_ctx_t sctx = ctx->ifc_sctx;
1238	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1239	device_t dev = ctx->ifc_dev;
1240	int err, nsegments, ntsosegments;
1241
1242	nsegments = scctx->isc_tx_nsegments;
1243	ntsosegments = scctx->isc_tx_tso_segments_max;
1244	MPASS(scctx->isc_ntxd[0] > 0);
1245	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
1246	MPASS(nsegments > 0);
1247	MPASS(ntsosegments > 0);
1248	/*
1249	 * Setup DMA descriptor areas.
1250	 */
1251	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
1252			       1, 0,			/* alignment, bounds */
1253			       BUS_SPACE_MAXADDR,	/* lowaddr */
1254			       BUS_SPACE_MAXADDR,	/* highaddr */
1255			       NULL, NULL,		/* filter, filterarg */
1256			       sctx->isc_tx_maxsize,		/* maxsize */
1257			       nsegments,	/* nsegments */
1258			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
1259			       0,			/* flags */
1260			       NULL,			/* lockfunc */
1261			       NULL,			/* lockfuncarg */
1262			       &txq->ift_desc_tag))) {
1263		device_printf(dev,"Unable to allocate TX DMA tag: %d\n", err);
1264		device_printf(dev,"maxsize: %ju nsegments: %d maxsegsize: %ju\n",
1265		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
1266		goto fail;
1267	}
1268#ifdef IFLIB_DIAGNOSTICS
1269	device_printf(dev,"maxsize: %zd nsegments: %d maxsegsize: %zd\n",
1270		      sctx->isc_tx_maxsize, nsegments, sctx->isc_tx_maxsegsize);
1271
1272#endif
1273	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
1274			       1, 0,			/* alignment, bounds */
1275			       BUS_SPACE_MAXADDR,	/* lowaddr */
1276			       BUS_SPACE_MAXADDR,	/* highaddr */
1277			       NULL, NULL,		/* filter, filterarg */
1278			       scctx->isc_tx_tso_size_max,		/* maxsize */
1279			       ntsosegments,	/* nsegments */
1280			       scctx->isc_tx_tso_segsize_max,	/* maxsegsize */
1281			       0,			/* flags */
1282			       NULL,			/* lockfunc */
1283			       NULL,			/* lockfuncarg */
1284			       &txq->ift_tso_desc_tag))) {
1285		device_printf(dev,"Unable to allocate TX TSO DMA tag: %d\n", err);
1286
1287		goto fail;
1288	}
1289#ifdef IFLIB_DIAGNOSTICS
1290	device_printf(dev,"TSO maxsize: %d ntsosegments: %d maxsegsize: %d\n",
1291		      scctx->isc_tx_tso_size_max, ntsosegments,
1292		      scctx->isc_tx_tso_segsize_max);
1293#endif
1294	if (!(txq->ift_sds.ifsd_flags =
1295	    (uint8_t *) malloc(sizeof(uint8_t) *
1296	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1297		device_printf(dev, "Unable to allocate tx_buffer memory\n");
1298		err = ENOMEM;
1299		goto fail;
1300	}
1301	if (!(txq->ift_sds.ifsd_m =
1302	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
1303	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1304		device_printf(dev, "Unable to allocate tx_buffer memory\n");
1305		err = ENOMEM;
1306		goto fail;
1307	}
1308
1309        /* Create the descriptor buffer dma maps */
1310#if defined(ACPI_DMAR) || (!(defined(__i386__) && !defined(__amd64__)))
1311	if ((ctx->ifc_flags & IFC_DMAR) == 0)
1312		return (0);
1313
1314	if (!(txq->ift_sds.ifsd_map =
1315	    (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1316		device_printf(dev, "Unable to allocate tx_buffer map memory\n");
1317		err = ENOMEM;
1318		goto fail;
1319	}
1320
1321	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
1322		err = bus_dmamap_create(txq->ift_desc_tag, 0, &txq->ift_sds.ifsd_map[i]);
1323		if (err != 0) {
1324			device_printf(dev, "Unable to create TX DMA map\n");
1325			goto fail;
1326		}
1327	}
1328#endif
1329	return (0);
1330fail:
1331	/* We free it all; this handles the case where allocation failed part-way through. */
1332	iflib_tx_structures_free(ctx);
1333	return (err);
1334}
1335
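/*
 * Tear down the DMA map, if any, associated with tx software descriptor @i.
 */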
1336static void
1337iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
1338{
1339	bus_dmamap_t map;
1340
1341	map = NULL;
1342	if (txq->ift_sds.ifsd_map != NULL)
1343		map = txq->ift_sds.ifsd_map[i];
1344	if (map != NULL) {
1345		bus_dmamap_unload(txq->ift_desc_tag, map);
1346		bus_dmamap_destroy(txq->ift_desc_tag, map);
1347		txq->ift_sds.ifsd_map[i] = NULL;
1348	}
1349}
1350
1351static void
1352iflib_txq_destroy(iflib_txq_t txq)
1353{
1354	if_ctx_t ctx = txq->ift_ctx;
1355
1356	for (int i = 0; i < txq->ift_size; i++)
1357		iflib_txsd_destroy(ctx, txq, i);
1358	if (txq->ift_sds.ifsd_map != NULL) {
1359		free(txq->ift_sds.ifsd_map, M_IFLIB);
1360		txq->ift_sds.ifsd_map = NULL;
1361	}
1362	if (txq->ift_sds.ifsd_m != NULL) {
1363		free(txq->ift_sds.ifsd_m, M_IFLIB);
1364		txq->ift_sds.ifsd_m = NULL;
1365	}
1366	if (txq->ift_sds.ifsd_flags != NULL) {
1367		free(txq->ift_sds.ifsd_flags, M_IFLIB);
1368		txq->ift_sds.ifsd_flags = NULL;
1369	}
1370	if (txq->ift_desc_tag != NULL) {
1371		bus_dma_tag_destroy(txq->ift_desc_tag);
1372		txq->ift_desc_tag = NULL;
1373	}
1374	if (txq->ift_tso_desc_tag != NULL) {
1375		bus_dma_tag_destroy(txq->ift_tso_desc_tag);
1376		txq->ift_tso_desc_tag = NULL;
1377	}
1378}
1379
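/*
 * Release the mbuf held by tx software descriptor @i, syncing and
 * unloading its DMA map first when maps are in use.
 */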
1380static void
1381iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
1382{
1383	struct mbuf **mp;
1384
1385	mp = &txq->ift_sds.ifsd_m[i];
1386	if (*mp == NULL)
1387		return;
1388
1389	if (txq->ift_sds.ifsd_map != NULL) {
1390		bus_dmamap_sync(txq->ift_desc_tag,
1391				txq->ift_sds.ifsd_map[i],
1392				BUS_DMASYNC_POSTWRITE);
1393		bus_dmamap_unload(txq->ift_desc_tag,
1394				  txq->ift_sds.ifsd_map[i]);
1395	}
1396	m_free(*mp);
1397	DBG_COUNTER_INC(tx_frees);
1398	*mp = NULL;
1399}
1400
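/*
 * Reset a transmit queue to its post-attach state: clear the ring indices,
 * zero the hardware descriptor areas, and give the driver a chance to do
 * its own per-queue setup.
 */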
1401static int
1402iflib_txq_setup(iflib_txq_t txq)
1403{
1404	if_ctx_t ctx = txq->ift_ctx;
1405	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1406	iflib_dma_info_t di;
1407	int i;
1408
1409    /* Set number of descriptors available */
1410	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
1411
1412	/* Reset indices */
1413	txq->ift_cidx_processed = txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
1414	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
1415
1416	for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++)
1417		bzero((void *)di->idi_vaddr, di->idi_size);
1418
1419	IFDI_TXQ_SETUP(ctx, txq->ift_id);
1420	for (i = 0, di = txq->ift_ifdi; i < ctx->ifc_nhwtxqs; i++, di++)
1421		bus_dmamap_sync(di->idi_tag, di->idi_map,
1422						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1423	return (0);
1424}
1425
1426/*********************************************************************
1427 *
1428 *  Allocate memory for rx_buffer structures. Since we use one
1429 *  rx_buffer per received packet, the maximum number of rx_buffer's
1430 *  that we'll need is equal to the number of receive descriptors
1431 *  that we've allocated.
1432 *
1433 **********************************************************************/
1434static int
1435iflib_rxsd_alloc(iflib_rxq_t rxq)
1436{
1437	if_ctx_t ctx = rxq->ifr_ctx;
1438	if_shared_ctx_t sctx = ctx->ifc_sctx;
1439	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1440	device_t dev = ctx->ifc_dev;
1441	iflib_fl_t fl;
1442	iflib_rxsd_t	rxsd;
1443	int			err;
1444
1445	MPASS(scctx->isc_nrxd[0] > 0);
1446	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
1447
1448	fl = rxq->ifr_fl;
1449	for (int i = 0; i <  rxq->ifr_nfl; i++, fl++) {
1450		fl->ifl_sds = malloc(sizeof(struct iflib_sw_rx_desc) *
1451		    scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB,
1452		    M_WAITOK | M_ZERO);
1453		if (fl->ifl_sds == NULL) {
1454			device_printf(dev, "Unable to allocate rx sw desc memory\n");
1455			return (ENOMEM);
1456		}
1457		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
1458		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
1459					 1, 0,			/* alignment, bounds */
1460					 BUS_SPACE_MAXADDR,	/* lowaddr */
1461					 BUS_SPACE_MAXADDR,	/* highaddr */
1462					 NULL, NULL,		/* filter, filterarg */
1463					 sctx->isc_rx_maxsize,	/* maxsize */
1464					 sctx->isc_rx_nsegments,	/* nsegments */
1465					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
1466					 0,			/* flags */
1467					 NULL,			/* lockfunc */
1468					 NULL,			/* lockarg */
1469					 &fl->ifl_desc_tag);
1470		if (err) {
1471			device_printf(dev, "%s: bus_dma_tag_create failed %d\n",
1472				__func__, err);
1473			goto fail;
1474		}
1475
1476		rxsd = fl->ifl_sds;
1477		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++, rxsd++) {
1478			err = bus_dmamap_create(fl->ifl_desc_tag, 0, &rxsd->ifsd_map);
1479			if (err) {
1480				device_printf(dev, "%s: bus_dmamap_create failed: %d\n",
1481					__func__, err);
1482				goto fail;
1483			}
1484		}
1485	}
1486	return (0);
1487
1488fail:
1489	iflib_rx_structures_free(ctx);
1490	return (err);
1491}
1492
1493
1494/*
1495 * Internal service routines
1496 */
1497
1498struct rxq_refill_cb_arg {
1499	int               error;
1500	bus_dma_segment_t seg;
1501	int               nseg;
1502};
1503
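/*
 * bus_dmamap_load() callback for free-list refills: stash the mapped
 * segment and error status for the caller to inspect.
 */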
1504static void
1505_rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
1506{
1507	struct rxq_refill_cb_arg *cb_arg = arg;
1508
1509	cb_arg->error = error;
1510	cb_arg->seg = segs[0];
1511	cb_arg->nseg = nseg;
1512}
1513
1514
1515#ifdef ACPI_DMAR
1516#define IS_DMAR(ctx) (ctx->ifc_flags & IFC_DMAR)
1517#else
1518#define IS_DMAR(ctx) (0)
1519#endif
1520
1521/**
1522 *	_iflib_fl_refill - refill an rxq free-buffer list
1523 *	@ctx: the iflib context
1524 *	@fl: the free-list to refill
1525 *	@count: the number of new buffers to allocate
1526 *
1527 *	(Re)populate an rxq free-buffer list with up to @count new packet buffers.
1528 *	The caller must ensure that @count does not exceed the free list's capacity.
1529 */
1530static void
1531_iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
1532{
1533	struct mbuf *m;
1534	int pidx = fl->ifl_pidx;
1535	iflib_rxsd_t rxsd = &fl->ifl_sds[pidx];
1536	caddr_t cl;
1537	int n, i = 0;
1538	uint64_t bus_addr;
1539	int err;
1540
1541	n  = count;
1542	MPASS(n > 0);
1543	MPASS(fl->ifl_credits + n <= fl->ifl_size);
1544
1545	if (pidx < fl->ifl_cidx)
1546		MPASS(pidx + n <= fl->ifl_cidx);
1547	if (pidx == fl->ifl_cidx && (fl->ifl_credits < fl->ifl_size))
1548		MPASS(fl->ifl_gen == 0);
1549	if (pidx > fl->ifl_cidx)
1550		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
1551
1552	DBG_COUNTER_INC(fl_refills);
1553	if (n > 8)
1554		DBG_COUNTER_INC(fl_refills_large);
1555
1556	while (n--) {
1557		/*
1558		 * We allocate an uninitialized mbuf + cluster; the mbuf is
1559		 * initialized after rx.
1560		 *
1561		 * If the cluster is still set then we know a minimum-sized packet was received.
1562		 */
1563		if ((cl = rxsd->ifsd_cl) == NULL) {
1564			if ((cl = rxsd->ifsd_cl = m_cljget(NULL, M_NOWAIT, fl->ifl_buf_size)) == NULL)
1565				break;
1566#if MEMORY_LOGGING
1567			fl->ifl_cl_enqueued++;
1568#endif
1569		}
1570		if ((m = m_gethdr(M_NOWAIT, MT_NOINIT)) == NULL) {
1571			break;
1572		}
1573#if MEMORY_LOGGING
1574		fl->ifl_m_enqueued++;
1575#endif
1576
1577		DBG_COUNTER_INC(rx_allocs);
1578#ifdef notyet
1579		if ((rxsd->ifsd_flags & RX_SW_DESC_MAP_CREATED) == 0) {
1580			int err;
1581
1582			if ((err = bus_dmamap_create(fl->ifl_ifdi->idi_tag, 0, &rxsd->ifsd_map))) {
1583				log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
1584				uma_zfree(fl->ifl_zone, cl);
1585				n = 0;
1586				goto done;
1587			}
1588			rxsd->ifsd_flags |= RX_SW_DESC_MAP_CREATED;
1589		}
1590#endif
1591#if defined(__i386__) || defined(__amd64__)
1592		if (!IS_DMAR(ctx)) {
1593			bus_addr = pmap_kextract((vm_offset_t)cl);
1594		} else
1595#endif
1596		{
1597			struct rxq_refill_cb_arg cb_arg;
1598			iflib_rxq_t q;
1599
1600			cb_arg.error = 0;
1601			q = fl->ifl_rxq;
1602			err = bus_dmamap_load(fl->ifl_desc_tag, rxsd->ifsd_map,
1603		         cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg, 0);
1604
1605			if (err != 0 || cb_arg.error) {
1606				/*
1607				 * !zone_pack ?
1608				 */
1609				if (fl->ifl_zone == zone_pack)
1610					uma_zfree(fl->ifl_zone, cl);
1611				m_free(m);
1612				n = 0;
1613				goto done;
1614			}
1615			bus_addr = cb_arg.seg.ds_addr;
1616		}
1617		rxsd->ifsd_flags |= RX_SW_DESC_INUSE;
1618
1619		MPASS(rxsd->ifsd_m == NULL);
1620		rxsd->ifsd_cl = cl;
1621		rxsd->ifsd_m = m;
1622		fl->ifl_bus_addrs[i] = bus_addr;
1623		fl->ifl_vm_addrs[i] = cl;
1624		rxsd++;
1625		fl->ifl_credits++;
1626		i++;
1627		MPASS(fl->ifl_credits <= fl->ifl_size);
1628		if (++fl->ifl_pidx == fl->ifl_size) {
1629			fl->ifl_pidx = 0;
1630			fl->ifl_gen = 1;
1631			rxsd = fl->ifl_sds;
1632		}
1633		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
1634			ctx->isc_rxd_refill(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx,
1635								 fl->ifl_bus_addrs, fl->ifl_vm_addrs, i, fl->ifl_buf_size);
1636			i = 0;
1637			pidx = fl->ifl_pidx;
1638		}
1639	}
1640done:
1641	DBG_COUNTER_INC(rxd_flush);
1642	if (fl->ifl_pidx == 0)
1643		pidx = fl->ifl_size - 1;
1644	else
1645		pidx = fl->ifl_pidx - 1;
1646	ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id, fl->ifl_id, pidx);
1647}
1648
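/*
 * Refill a free list with at most @max buffers, limited to the space that
 * is actually reclaimable (one slot is always left empty).
 */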
1649static __inline void
1650__iflib_fl_refill_lt(if_ctx_t ctx, iflib_fl_t fl, int max)
1651{
1652	/* we avoid allowing pidx to catch up with cidx as it confuses ixl */
1653	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
1654#ifdef INVARIANTS
1655	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
1656#endif
1657
1658	MPASS(fl->ifl_credits <= fl->ifl_size);
1659	MPASS(reclaimable == delta);
1660
1661	if (reclaimable > 0)
1662		_iflib_fl_refill(ctx, fl, min(max, reclaimable));
1663}
1664
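/*
 * Free every cluster and mbuf still held by a free list and reset the
 * list's indices and the backing descriptor memory.
 */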
1665static void
1666iflib_fl_bufs_free(iflib_fl_t fl)
1667{
1668	iflib_dma_info_t idi = fl->ifl_ifdi;
1669	uint32_t i;
1670
1671	for (i = 0; i < fl->ifl_size; i++) {
1672		iflib_rxsd_t d = &fl->ifl_sds[i];
1673
1674		if (d->ifsd_flags & RX_SW_DESC_INUSE) {
1675			bus_dmamap_unload(fl->ifl_desc_tag, d->ifsd_map);
1676			bus_dmamap_destroy(fl->ifl_desc_tag, d->ifsd_map);
1677			if (d->ifsd_m != NULL) {
1678				m_init(d->ifsd_m, M_NOWAIT, MT_DATA, 0);
1679				uma_zfree(zone_mbuf, d->ifsd_m);
1680			}
1681			if (d->ifsd_cl != NULL)
1682				uma_zfree(fl->ifl_zone, d->ifsd_cl);
1683			d->ifsd_flags = 0;
1684		} else {
1685			MPASS(d->ifsd_cl == NULL);
1686			MPASS(d->ifsd_m == NULL);
1687		}
1688#if MEMORY_LOGGING
1689		fl->ifl_m_dequeued++;
1690		fl->ifl_cl_dequeued++;
1691#endif
1692		d->ifsd_cl = NULL;
1693		d->ifsd_m = NULL;
1694	}
1695	/*
1696	 * Reset free list values
1697	 */
1698	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = 0;
1699	bzero(idi->idi_vaddr, idi->idi_size);
1700}
1701
1702/*********************************************************************
1703 *
1704 *  Initialize a receive ring and its buffers.
1705 *
1706 **********************************************************************/
1707static int
1708iflib_fl_setup(iflib_fl_t fl)
1709{
1710	iflib_rxq_t rxq = fl->ifl_rxq;
1711	if_ctx_t ctx = rxq->ifr_ctx;
1712	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
1713
1714	/*
1715	** Free current RX buffer structs and their mbufs
1716	*/
1717	iflib_fl_bufs_free(fl);
1718	/* Now replenish the mbufs */
1719	MPASS(fl->ifl_credits == 0);
1720	/*
1721	 * XXX don't set the max_frame_size to larger
1722	 * than the hardware can handle
1723	 */
1724	if (sctx->isc_max_frame_size <= 2048)
1725		fl->ifl_buf_size = MCLBYTES;
1726	else if (sctx->isc_max_frame_size <= 4096)
1727		fl->ifl_buf_size = MJUMPAGESIZE;
1728	else if (sctx->isc_max_frame_size <= 9216)
1729		fl->ifl_buf_size = MJUM9BYTES;
1730	else
1731		fl->ifl_buf_size = MJUM16BYTES;
1732	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
1733		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
1734	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
1735	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
1736
1737
1738	/* avoid pre-allocating zillions of clusters to an idle card
1739	 * potentially speeding up attach
1740	 */
1741	_iflib_fl_refill(ctx, fl, min(128, fl->ifl_size));
1742	MPASS(min(128, fl->ifl_size) == fl->ifl_credits);
1743	if (min(128, fl->ifl_size) != fl->ifl_credits)
1744		return (ENOBUFS);
1745	/*
1746	 * handle failure
1747	 */
1748	MPASS(rxq != NULL);
1749	MPASS(fl->ifl_ifdi != NULL);
1750	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
1751	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1752	return (0);
1753}
1754
1755/*********************************************************************
1756 *
1757 *  Free receive ring data structures
1758 *
1759 **********************************************************************/
1760static void
1761iflib_rx_sds_free(iflib_rxq_t rxq)
1762{
1763	iflib_fl_t fl;
1764	int i;
1765
1766	if (rxq->ifr_fl != NULL) {
1767		for (i = 0; i < rxq->ifr_nfl; i++) {
1768			fl = &rxq->ifr_fl[i];
1769			if (fl->ifl_desc_tag != NULL) {
1770				bus_dma_tag_destroy(fl->ifl_desc_tag);
1771				fl->ifl_desc_tag = NULL;
1772			}
1773		}
1774		if (rxq->ifr_fl->ifl_sds != NULL)
1775			free(rxq->ifr_fl->ifl_sds, M_IFLIB);
1776
1777		free(rxq->ifr_fl, M_IFLIB);
1778		rxq->ifr_fl = NULL;
1779		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
1780	}
1781}
1782
1783/*
1784 * Machine-independent (MI) logic
1785 *
1786 */
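/*
 * Per-TX-queue watchdog callout.  It gives the driver a chance to update its
 * queue state via IFDI_TIMER, reinitializes the interface if the queue has
 * been marked HUNG with no pause frames seen, and otherwise kicks the TX
 * task when descriptors run low or the mp_ring has stalled.  Rescheduled
 * every hz/2 while the interface is running.
 */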
1787static void
1788iflib_timer(void *arg)
1789{
1790	iflib_txq_t txq = arg;
1791	if_ctx_t ctx = txq->ift_ctx;
1792	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1793
1794	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
1795		return;
1796	/*
1797	** Check on the state of the TX queue(s); this
1798	** can be done without the lock because it's read-only
1799	** and the HUNG state will be static if set.
1800	*/
1801	IFDI_TIMER(ctx, txq->ift_id);
1802	if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
1803		(ctx->ifc_pause_frames == 0))
1804		goto hung;
1805
1806	if (TXQ_AVAIL(txq) <= 2*scctx->isc_tx_nsegments ||
1807	    ifmp_ring_is_stalled(txq->ift_br[0]))
1808		GROUPTASK_ENQUEUE(&txq->ift_task);
1809
1810	ctx->ifc_pause_frames = 0;
1811	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)
1812		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu);
1813	return;
1814hung:
1815	CTX_LOCK(ctx);
1816	if_setdrvflagbits(ctx->ifc_ifp, 0, IFF_DRV_RUNNING);
1817	device_printf(ctx->ifc_dev,  "TX(%d) desc avail = %d, pidx = %d\n",
1818				  txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
1819
1820	IFDI_WATCHDOG_RESET(ctx);
1821	ctx->ifc_watchdog_events++;
1822	ctx->ifc_pause_frames = 0;
1823
1824	iflib_init_locked(ctx);
1825	CTX_UNLOCK(ctx);
1826}
1827
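/*
 * Bring the interface up with the context lock held: translate the enabled
 * capabilities into if_hwassist bits, stop the per-queue callouts, run the
 * netmap and driver (IFDI_INIT) initialization, replenish every free list,
 * then mark the interface RUNNING, re-enable interrupts and restart the
 * queue timers.
 */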
1828static void
1829iflib_init_locked(if_ctx_t ctx)
1830{
1831	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
1832	if_t ifp = ctx->ifc_ifp;
1833	iflib_fl_t fl;
1834	iflib_txq_t txq;
1835	iflib_rxq_t rxq;
1836	int i, j;
1837
1838
1839	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
1840	IFDI_INTR_DISABLE(ctx);
1841
1842	/* Set hardware offload abilities */
1843	if_clearhwassist(ifp);
1844	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
1845		if_sethwassistbits(ifp, CSUM_IP | CSUM_TCP | CSUM_UDP, 0);
1846	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
1847		if_sethwassistbits(ifp,  (CSUM_TCP_IPV6 | CSUM_UDP_IPV6), 0);
1848	if (if_getcapenable(ifp) & IFCAP_TSO4)
1849		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
1850	if (if_getcapenable(ifp) & IFCAP_TSO6)
1851		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
1852
1853	for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
1854		CALLOUT_LOCK(txq);
1855		callout_stop(&txq->ift_timer);
1856		callout_stop(&txq->ift_db_check);
1857		CALLOUT_UNLOCK(txq);
1858		iflib_netmap_txq_init(ctx, txq);
1859	}
1860	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
1861		iflib_netmap_rxq_init(ctx, rxq);
1862	}
1863#ifdef INVARIANTS
1864	i = if_getdrvflags(ifp);
1865#endif
1866	IFDI_INIT(ctx);
1867	MPASS(if_getdrvflags(ifp) == i);
1868	for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
1869		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
1870			if (iflib_fl_setup(fl)) {
1871				device_printf(ctx->ifc_dev, "freelist setup failed - check cluster settings\n");
1872				goto done;
1873			}
1874		}
1875	}
1876	done:
1877	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
1878	IFDI_INTR_ENABLE(ctx);
1879	txq = ctx->ifc_txqs;
1880	for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
1881		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq,
1882			txq->ift_timer.c_cpu);
1883}
1884
1885static int
1886iflib_media_change(if_t ifp)
1887{
1888	if_ctx_t ctx = if_getsoftc(ifp);
1889	int err;
1890
1891	CTX_LOCK(ctx);
1892	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
1893		iflib_init_locked(ctx);
1894	CTX_UNLOCK(ctx);
1895	return (err);
1896}
1897
1898static void
1899iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
1900{
1901	if_ctx_t ctx = if_getsoftc(ifp);
1902
1903	CTX_LOCK(ctx);
1904	IFDI_UPDATE_ADMIN_STATUS(ctx);
1905	IFDI_MEDIA_STATUS(ctx, ifmr);
1906	CTX_UNLOCK(ctx);
1907}
1908
1909static void
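/*
 * Quiesce the interface: mark it OACTIVE/!RUNNING, disable interrupts and
 * wait briefly for in-flight users, drain and free every software TX
 * descriptor, reset the TX accounting and mp_ring statistics, release the
 * RX free list buffers and zero the hardware descriptor rings before
 * handing control to the driver via IFDI_STOP.
 */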
1910iflib_stop(if_ctx_t ctx)
1911{
1912	iflib_txq_t txq = ctx->ifc_txqs;
1913	iflib_rxq_t rxq = ctx->ifc_rxqs;
1914	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1915	iflib_dma_info_t di;
1916	iflib_fl_t fl;
1917	int i, j;
1918
1919	/* Tell the stack that the interface is no longer active */
1920	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
1921
1922	IFDI_INTR_DISABLE(ctx);
1923	msleep(ctx, &ctx->ifc_mtx, PUSER, "iflib_init", hz);
1924
1925	/* Wait for current tx queue users to exit to disarm watchdog timer. */
1926	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
1927		/* make sure all transmitters have completed before proceeding XXX */
1928
1929		/* clean any enqueued buffers */
1930		iflib_txq_check_drain(txq, 0);
1931		/* Free any existing tx buffers. */
1932		for (j = 0; j < txq->ift_size; j++) {
1933			iflib_txsd_free(ctx, txq, j);
1934		}
1935		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
1936		txq->ift_in_use = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
1937		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
1938		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
1939		txq->ift_pullups = 0;
1940		ifmp_ring_reset_stats(txq->ift_br[0]);
1941		for (j = 0, di = txq->ift_ifdi; j < ctx->ifc_nhwtxqs; j++, di++)
1942			bzero((void *)di->idi_vaddr, di->idi_size);
1943	}
1944	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
1945		/* make sure all receivers have completed before proceeding XXX */
1946
1947		for (j = 0, di = rxq->ifr_ifdi; j < ctx->ifc_nhwrxqs; j++, di++)
1948			bzero((void *)di->idi_vaddr, di->idi_size);
1949		/* also resets the free lists pidx/cidx */
1950		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
1951			iflib_fl_bufs_free(fl);
1952	}
1953	IFDI_STOP(ctx);
1954}
1955
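/*
 * Translate a fragment reported by the driver (free list id + index) back
 * into its software descriptor: charge the free list one credit, sync the
 * descriptor DMA area, optionally unload the busdma map and advance the
 * free list consumer index.  The cluster type is returned through *cltype
 * when the caller intends to take ownership of the cluster.
 */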
1956static iflib_rxsd_t
1957rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int *cltype, int unload)
1958{
1959	int flid, cidx;
1960	iflib_rxsd_t sd;
1961	iflib_fl_t fl;
1962	iflib_dma_info_t di;
1963
1964	flid = irf->irf_flid;
1965	cidx = irf->irf_idx;
1966	fl = &rxq->ifr_fl[flid];
1967	fl->ifl_credits--;
1968#if MEMORY_LOGGING
1969	fl->ifl_m_dequeued++;
1970	if (cltype)
1971		fl->ifl_cl_dequeued++;
1972#endif
1973	sd = &fl->ifl_sds[cidx];
1974	di = fl->ifl_ifdi;
1975	bus_dmamap_sync(di->idi_tag, di->idi_map,
1976			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1977
1978	/* not valid assert if bxe really does SGE from non-contiguous elements */
1979	MPASS(fl->ifl_cidx == cidx);
1980	if (unload)
1981		bus_dmamap_unload(fl->ifl_desc_tag, sd->ifsd_map);
1982
1983	if (__predict_false(++fl->ifl_cidx == fl->ifl_size)) {
1984		fl->ifl_cidx = 0;
1985		fl->ifl_gen = 0;
1986	}
1987	/* YES ick */
1988	if (cltype)
1989		*cltype = fl->ifl_cltype;
1990	return (sd);
1991}
1992
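/*
 * Build an mbuf chain for a multi-fragment (or large single-fragment)
 * packet: for each fragment, detach the spare mbuf and cluster from the
 * software descriptor, attach the cluster with m_cljset and append it to
 * the chain.  Zero-length fragments are skipped, and any RX pad is only
 * applied to the first fragment.
 */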
1993static struct mbuf *
1994assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri)
1995{
1996	int i, padlen , flags, cltype;
1997	struct mbuf *m, *mh, *mt;
1998	iflib_rxsd_t sd;
1999	caddr_t cl;
2000
2001	i = 0;
2002	mh = NULL;
2003	do {
2004		sd = rxd_frag_to_sd(rxq, &ri->iri_frags[i], &cltype, TRUE);
2005
2006		MPASS(sd->ifsd_cl != NULL);
2007		MPASS(sd->ifsd_m != NULL);
2008
2009		/* Don't include zero-length frags */
2010		if (ri->iri_frags[i].irf_len == 0) {
2011			/* XXX we can save the cluster here, but not the mbuf */
2012			m_init(sd->ifsd_m, M_NOWAIT, MT_DATA, 0);
2013			m_free(sd->ifsd_m);
2014			sd->ifsd_m = NULL;
2015			continue;
2016		}
2017
2018		m = sd->ifsd_m;
2019		if (mh == NULL) {
2020			flags = M_PKTHDR|M_EXT;
2021			mh = mt = m;
2022			padlen = ri->iri_pad;
2023		} else {
2024			flags = M_EXT;
2025			mt->m_next = m;
2026			mt = m;
2027			/* assuming padding is only on the first fragment */
2028			padlen = 0;
2029		}
2030		sd->ifsd_m = NULL;
2031		cl = sd->ifsd_cl;
2032		sd->ifsd_cl = NULL;
2033
2034		/* Can these two be made one ? */
2035		m_init(m, M_NOWAIT, MT_DATA, flags);
2036		m_cljset(m, cl, cltype);
2037		/*
2038		 * These must follow m_init and m_cljset
2039		 */
2040		m->m_data += padlen;
2041		ri->iri_len -= padlen;
2042		m->m_len = ri->iri_frags[i].irf_len;
2043	} while (++i < ri->iri_nfrags);
2044
2045	return (mh);
2046}
2047
2048/*
2049 * Process one software descriptor
2050 */
2051static struct mbuf *
2052iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
2053{
2054	struct mbuf *m;
2055	iflib_rxsd_t sd;
2056
2057	/* should I merge this back in now that the two paths are basically duplicated? */
2058	if (ri->iri_nfrags == 1 &&
2059	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
2060		sd = rxd_frag_to_sd(rxq, &ri->iri_frags[0], NULL, FALSE);
2061		m = sd->ifsd_m;
2062		sd->ifsd_m = NULL;
2063		m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
2064		memcpy(m->m_data, sd->ifsd_cl, ri->iri_len);
2065		m->m_len = ri->iri_frags[0].irf_len;
2066	} else {
2067		m = assemble_segments(rxq, ri);
2068	}
2069	m->m_pkthdr.len = ri->iri_len;
2070	m->m_pkthdr.rcvif = ri->iri_ifp;
2071	m->m_flags |= ri->iri_flags;
2072	m->m_pkthdr.ether_vtag = ri->iri_vtag;
2073	m->m_pkthdr.flowid = ri->iri_flowid;
2074	M_HASHTYPE_SET(m, ri->iri_rsstype);
2075	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
2076	m->m_pkthdr.csum_data = ri->iri_csum_data;
2077	return (m);
2078}
2079
2080static bool
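/*
 * Main RX processing loop, bounded by "budget" descriptors.  For each
 * completed descriptor the driver fills in an if_rxd_info via
 * isc_rxd_pkt_get, iflib_rxd_pkt_get turns it into an mbuf, and the
 * resulting packets are chained and then handed to LRO (when enabled) or
 * if_input.  Free lists are topped up before returning; the return value
 * indicates whether more work remains so the caller can reschedule.
 */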
2081iflib_rxeof(iflib_rxq_t rxq, int budget)
2082{
2083	if_ctx_t ctx = rxq->ifr_ctx;
2084	if_shared_ctx_t sctx = ctx->ifc_sctx;
2085	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2086	int avail, i;
2087	uint16_t *cidxp;
2088	struct if_rxd_info ri;
2089	int err, budget_left, rx_bytes, rx_pkts;
2090	iflib_fl_t fl;
2091	struct ifnet *ifp;
2092	int lro_enabled;
2093	/*
2094	 * XXX early demux data packets so that if_input processing only handles
2095	 * acks in interrupt context
2096	 */
2097	struct mbuf *m, *mh, *mt;
2098
2099	if (netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &budget)) {
2100		return (FALSE);
2101	}
2102
2103	mh = mt = NULL;
2104	MPASS(budget > 0);
2105	rx_pkts	= rx_bytes = 0;
2106	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
2107		cidxp = &rxq->ifr_cq_cidx;
2108	else
2109		cidxp = &rxq->ifr_fl[0].ifl_cidx;
2110	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
2111		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
2112			__iflib_fl_refill_lt(ctx, fl, budget + 8);
2113		DBG_COUNTER_INC(rx_unavail);
2114		return (false);
2115	}
2116
2117	for (budget_left = budget; (budget_left > 0) && (avail > 0); budget_left--, avail--) {
2118		if (__predict_false(!CTX_ACTIVE(ctx))) {
2119			DBG_COUNTER_INC(rx_ctx_inactive);
2120			break;
2121		}
2122		/*
2123		 * Reset client set fields to their default values
2124		 */
2125		bzero(&ri, sizeof(ri));
2126		ri.iri_qsidx = rxq->ifr_id;
2127		ri.iri_cidx = *cidxp;
2128		ri.iri_ifp = ctx->ifc_ifp;
2129		ri.iri_frags = rxq->ifr_frags;
2130		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
2131
2132		/* in lieu of handling the error correctly, assert that it is not silently ignored */
2133		MPASS(err == 0);
2134		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
2135			*cidxp = ri.iri_cidx;
2136			/* Update our consumer index */
2137			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0]) {
2138				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
2139				rxq->ifr_cq_gen = 0;
2140			}
2141			/* was this only a completion queue message? */
2142			if (__predict_false(ri.iri_nfrags == 0))
2143				continue;
2144		}
2145		MPASS(ri.iri_nfrags != 0);
2146		MPASS(ri.iri_len != 0);
2147
2148		/* will advance the cidx on the corresponding free lists */
2149		m = iflib_rxd_pkt_get(rxq, &ri);
2150		if (avail == 0 && budget_left)
2151			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
2152
2153		if (__predict_false(m == NULL)) {
2154			DBG_COUNTER_INC(rx_mbuf_null);
2155			continue;
2156		}
2157		/* imm_pkt: -- cxgb */
2158		if (mh == NULL)
2159			mh = mt = m;
2160		else {
2161			mt->m_nextpkt = m;
2162			mt = m;
2163		}
2164	}
2165	/* make sure that we can refill faster than drain */
2166	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
2167		__iflib_fl_refill_lt(ctx, fl, budget + 8);
2168
2169	ifp = ctx->ifc_ifp;
2170	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
2171	while (mh != NULL) {
2172		m = mh;
2173		mh = mh->m_nextpkt;
2174		m->m_nextpkt = NULL;
2175		rx_bytes += m->m_pkthdr.len;
2176		rx_pkts++;
2177#if defined(INET6) || defined(INET)
2178		if (lro_enabled && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
2179			continue;
2180#endif
2181		DBG_COUNTER_INC(rx_if_input);
2182		ifp->if_input(ifp, m);
2183	}
2184
2185	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
2186	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
2187
2188	/*
2189	 * Flush any outstanding LRO work
2190	 */
2191#if defined(INET6) || defined(INET)
2192	tcp_lro_flush_all(&rxq->ifr_lc);
2193#endif
2194	if (avail)
2195		return true;
2196	return (iflib_rxd_avail(ctx, rxq, *cidxp, 1));
2197}
2198
2199#define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
2200#define M_HAS_VLANTAG(m) ((m)->m_flags & M_VLANTAG)
2201#define TXQ_MAX_DB_DEFERRED(size) ((size) >> 5)
2202#define TXQ_MAX_DB_CONSUMED(size) ((size) >> 4)
2203
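/*
 * Write the TX doorbell, either unconditionally ("ring") or once enough
 * descriptors have accumulated to amortize the register write.  The
 * doorbell lock is only tried, never waited on, so a concurrent writer
 * simply covers the update for us.
 */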
2204static __inline void
2205iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring)
2206{
2207	uint32_t dbval;
2208
2209	if (ring || txq->ift_db_pending >=
2210	    TXQ_MAX_DB_DEFERRED(txq->ift_size)) {
2211
2212		/* the lock will only ever be contended in the !min_latency case */
2213		if (!TXDB_TRYLOCK(txq))
2214			return;
2215		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
2216		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
2217		txq->ift_db_pending = txq->ift_npending = 0;
2218		TXDB_UNLOCK(txq);
2219	}
2220}
2221
2222static void
2223iflib_txd_deferred_db_check(void * arg)
2224{
2225	iflib_txq_t txq = arg;
2226
2227	/* simple non-zero boolean so use bitwise OR */
2228	if ((txq->ift_db_pending | txq->ift_npending) &&
2229	    txq->ift_db_pending >= txq->ift_db_pending_queued)
2230		iflib_txd_db_check(txq->ift_ctx, txq, TRUE);
2231	txq->ift_db_pending_queued = 0;
2232	if (ifmp_ring_is_stalled(txq->ift_br[0]))
2233		iflib_txq_check_drain(txq, 4);
2234}
2235
2236#ifdef PKT_DEBUG
2237static void
2238print_pkt(if_pkt_info_t pi)
2239{
2240	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
2241	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
2242	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
2243	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
2244	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
2245	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
2246}
2247#endif
2248
2249#define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
2250#define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
2251
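/*
 * Extract the L2/L3/L4 header fields needed for checksum and TSO offload
 * from the outbound packet into the if_pkt_info the driver will consume,
 * pulling up mbuf data wherever the headers are not contiguous.  For TSO
 * the pseudo-header checksum is also seeded here.
 */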
2252static int
2253iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
2254{
2255	struct ether_vlan_header *eh;
2256	struct mbuf *m, *n;
2257
2258	n = m = *mp;
2259	/*
2260	 * Determine where frame payload starts.
2261	 * Jump over vlan headers if already present,
2262	 * helpful for QinQ too.
2263	 */
2264	if (__predict_false(m->m_len < sizeof(*eh))) {
2265		txq->ift_pullups++;
2266		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
2267			return (ENOMEM);
2268	}
2269	eh = mtod(m, struct ether_vlan_header *);
2270	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
2271		pi->ipi_etype = ntohs(eh->evl_proto);
2272		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
2273	} else {
2274		pi->ipi_etype = ntohs(eh->evl_encap_proto);
2275		pi->ipi_ehdrlen = ETHER_HDR_LEN;
2276	}
2277
2278	switch (pi->ipi_etype) {
2279#ifdef INET
2280	case ETHERTYPE_IP:
2281	{
2282		struct ip *ip = NULL;
2283		struct tcphdr *th = NULL;
2284		int minthlen;
2285
2286		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
2287		if (__predict_false(m->m_len < minthlen)) {
2288			/*
2289			 * if this code bloat is causing too much of a hit,
2290			 * move it to a separate function and mark it noinline
2291			 */
2292			if (m->m_len == pi->ipi_ehdrlen) {
2293				n = m->m_next;
2294				MPASS(n);
2295				if (n->m_len >= sizeof(*ip))  {
2296					ip = (struct ip *)n->m_data;
2297					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
2298						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
2299				} else {
2300					txq->ift_pullups++;
2301					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
2302						return (ENOMEM);
2303					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
2304				}
2305			} else {
2306				txq->ift_pullups++;
2307				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
2308					return (ENOMEM);
2309				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
2310				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
2311					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
2312			}
2313		} else {
2314			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
2315			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
2316				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
2317		}
2318		pi->ipi_ip_hlen = ip->ip_hl << 2;
2319		pi->ipi_ipproto = ip->ip_p;
2320		pi->ipi_flags |= IPI_TX_IPV4;
2321
2322		if (pi->ipi_csum_flags & CSUM_IP)
2323			ip->ip_sum = 0;
2324
2325		if (pi->ipi_ipproto == IPPROTO_TCP) {
2326			if (__predict_false(th == NULL)) {
2327				txq->ift_pullups++;
2328				if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
2329					return (ENOMEM);
2330				th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
2331			}
2332			pi->ipi_tcp_hflags = th->th_flags;
2333			pi->ipi_tcp_hlen = th->th_off << 2;
2334			pi->ipi_tcp_seq = th->th_seq;
2335		}
2336		if (IS_TSO4(pi)) {
2337			if (__predict_false(ip->ip_p != IPPROTO_TCP))
2338				return (ENXIO);
2339			th->th_sum = in_pseudo(ip->ip_src.s_addr,
2340					       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
2341			pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
2342		}
2343		break;
2344	}
2345#endif
2346#ifdef INET6
2347	case ETHERTYPE_IPV6:
2348	{
2349		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
2350		struct tcphdr *th;
2351		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
2352
2353		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
2354			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
2355				return (ENOMEM);
2356		}
2357		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
2358
2359		/* XXX-BZ this will go badly in case of ext hdrs. */
2360		pi->ipi_ipproto = ip6->ip6_nxt;
2361		pi->ipi_flags |= IPI_TX_IPV6;
2362
2363		if (pi->ipi_ipproto == IPPROTO_TCP) {
2364			if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
2365				if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
2366					return (ENOMEM);
2367			}
2368			pi->ipi_tcp_hflags = th->th_flags;
2369			pi->ipi_tcp_hlen = th->th_off << 2;
2370		}
2371		if (IS_TSO6(pi)) {
2372
2373			if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
2374				return (ENXIO);
2375			/*
2376			 * The corresponding flag is set by the stack in the IPv4
2377			 * TSO case, but not in IPv6 (at least in FreeBSD 10.2).
2378			 * So, set it here because the rest of the flow requires it.
2379			 */
2380			pi->ipi_csum_flags |= CSUM_TCP_IPV6;
2381			th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
2382			pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
2383		}
2384		break;
2385	}
2386#endif
2387	default:
2388		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
2389		pi->ipi_ip_hlen = 0;
2390		break;
2391	}
2392	*mp = m;
2393	return (0);
2394}
2395
2396
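/*
 * Drop zero-length mbufs that directly follow the packet header.  If the
 * first data-bearing mbuf uses external storage the packet header is
 * copied onto it; otherwise the chain is defragged so the header mbuf
 * also carries data.
 */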
2397static  __noinline  struct mbuf *
2398collapse_pkthdr(struct mbuf *m0)
2399{
2400	struct mbuf *m, *m_next, *tmp;
2401
2402	m = m0;
2403	m_next = m->m_next;
2404	while (m_next != NULL && m_next->m_len == 0) {
2405		m = m_next;
2406		m_next = m_next->m_next;
2407		m->m_next = NULL;
2408		m_free(m);
2409	}
2410	m = m0;
2411	m->m_next = m_next;
2412	if ((m_next->m_flags & M_EXT) == 0) {
2413		m = m_defrag(m, M_NOWAIT);
2414	} else {
2415		tmp = m_next->m_next;
2416		memcpy(m_next, m, MPKTHSIZE);
2417		m = m_next;
2418		m->m_next = tmp;
2419	}
2420	return (m);
2421}
2422
2423/*
2424 * If dodgy hardware rejects the scatter gather chain we've handed it
2425 * we'll need to remove the mbuf chain from ifsg_m[] before we can add the
2426 * m_defrag'd mbufs
2427 */
2428static __noinline struct mbuf *
2429iflib_remove_mbuf(iflib_txq_t txq)
2430{
2431	int ntxd, i, pidx;
2432	struct mbuf *m, *mh, **ifsd_m;
2433
2434	pidx = txq->ift_pidx;
2435	ifsd_m = txq->ift_sds.ifsd_m;
2436	ntxd = txq->ift_size;
2437	mh = m = ifsd_m[pidx];
2438	ifsd_m[pidx] = NULL;
2439#if MEMORY_LOGGING
2440	txq->ift_dequeued++;
2441#endif
2442	i = 1;
2443
2444	while (m) {
2445		ifsd_m[(pidx + i) & (ntxd -1)] = NULL;
2446#if MEMORY_LOGGING
2447		txq->ift_dequeued++;
2448#endif
2449		m = m->m_next;
2450		i++;
2451	}
2452	return (mh);
2453}
2454
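/*
 * Produce the bus_dma segment array for an outbound mbuf chain.  When the
 * queue has per-descriptor maps the chain is loaded through busdma and
 * each mbuf is recorded in ifsd_m[] so it can be freed at completion;
 * otherwise the segments are constructed directly with pmap_kextract,
 * splitting buffers at page boundaries.  EFBIG is returned when the chain
 * needs more than max_segs segments.
 */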
2455static int
2456iflib_busdma_load_mbuf_sg(iflib_txq_t txq, bus_dma_tag_t tag, bus_dmamap_t map,
2457			  struct mbuf **m0, bus_dma_segment_t *segs, int *nsegs,
2458			  int max_segs, int flags)
2459{
2460	if_ctx_t ctx;
2461	if_shared_ctx_t		sctx;
2462	if_softc_ctx_t		scctx;
2463	int i, next, pidx, mask, err, maxsegsz, ntxd, count;
2464	struct mbuf *m, *tmp, **ifsd_m, **mp;
2465
2466	m = *m0;
2467
2468	/*
2469	 * Please don't ever do this
2470	 */
2471	if (__predict_false(m->m_len == 0))
2472		*m0 = m = collapse_pkthdr(m);
2473
2474	ctx = txq->ift_ctx;
2475	sctx = ctx->ifc_sctx;
2476	scctx = &ctx->ifc_softc_ctx;
2477	ifsd_m = txq->ift_sds.ifsd_m;
2478	ntxd = txq->ift_size;
2479	pidx = txq->ift_pidx;
2480	if (map != NULL) {
2481		uint8_t *ifsd_flags = txq->ift_sds.ifsd_flags;
2482
2483		err = bus_dmamap_load_mbuf_sg(tag, map,
2484					      *m0, segs, nsegs, BUS_DMA_NOWAIT);
2485		if (err)
2486			return (err);
2487		ifsd_flags[pidx] |= TX_SW_DESC_MAPPED;
2488		i = 0;
2489		next = pidx;
2490		mask = (txq->ift_size-1);
2491		m = *m0;
2492		do {
2493			mp = &ifsd_m[next];
2494			*mp = m;
2495			m = m->m_next;
2496			if (__predict_false((*mp)->m_len == 0)) {
2497				m_free(*mp);
2498				*mp = NULL;
2499			} else
2500				next = (pidx + ++i) & (ntxd-1);
2501		} while (m != NULL);
2502	} else {
2503		int buflen, sgsize, max_sgsize;
2504		vm_offset_t vaddr;
2505		vm_paddr_t curaddr;
2506
2507		count = i = 0;
2508		maxsegsz = sctx->isc_tx_maxsize;
2509		m = *m0;
2510		do {
2511			if (__predict_false(m->m_len <= 0)) {
2512				tmp = m;
2513				m = m->m_next;
2514				tmp->m_next = NULL;
2515				m_free(tmp);
2516				continue;
2517			}
2518			buflen = m->m_len;
2519			vaddr = (vm_offset_t)m->m_data;
2520			/*
2521			 * see if we can't be smarter about physically
2522			 * contiguous mappings
2523			 */
2524			next = (pidx + count) & (ntxd-1);
2525			MPASS(ifsd_m[next] == NULL);
2526#if MEMORY_LOGGING
2527			txq->ift_enqueued++;
2528#endif
2529			ifsd_m[next] = m;
2530			while (buflen > 0) {
2531				max_sgsize = MIN(buflen, maxsegsz);
2532				curaddr = pmap_kextract(vaddr);
2533				sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
2534				sgsize = MIN(sgsize, max_sgsize);
2535				segs[i].ds_addr = curaddr;
2536				segs[i].ds_len = sgsize;
2537				vaddr += sgsize;
2538				buflen -= sgsize;
2539				i++;
2540				if (i >= max_segs)
2541					goto err;
2542			}
2543			count++;
2544			tmp = m;
2545			m = m->m_next;
2546		} while (m != NULL);
2547		*nsegs = i;
2548	}
2549	return (0);
2550err:
2551	*m0 = iflib_remove_mbuf(txq);
2552	return (EFBIG);
2553}
2554
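/*
 * Encapsulate one packet for transmission: parse the offload headers into
 * an if_pkt_info, DMA-load the mbuf chain (retrying once with m_collapse
 * and once with m_defrag on EFBIG), make sure enough descriptors remain,
 * then hand the segment list to the driver's isc_txd_encap and account
 * for the descriptors it consumed.
 */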
2555static int
2556iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
2557{
2558	if_ctx_t		ctx;
2559	if_shared_ctx_t		sctx;
2560	if_softc_ctx_t		scctx;
2561	bus_dma_segment_t	*segs;
2562	struct mbuf		*m_head;
2563	bus_dmamap_t		map;
2564	struct if_pkt_info	pi;
2565	int remap = 0;
2566	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
2567	bus_dma_tag_t desc_tag;
2568
2569	segs = txq->ift_segs;
2570	ctx = txq->ift_ctx;
2571	sctx = ctx->ifc_sctx;
2572	scctx = &ctx->ifc_softc_ctx;
2574	ntxd = txq->ift_size;
2575	m_head = *m_headp;
2576	map = NULL;
2577
2578	/*
2579	 * If we're doing TSO the next descriptor to clean may be quite far ahead
2580	 */
2581	cidx = txq->ift_cidx;
2582	pidx = txq->ift_pidx;
2583	next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
2584
2585	/* prefetch the next cache line of mbuf pointers and flags */
2586	prefetch(&txq->ift_sds.ifsd_m[next]);
2587	if (txq->ift_sds.ifsd_map != NULL) {
2588		prefetch(&txq->ift_sds.ifsd_map[next]);
2589		map = txq->ift_sds.ifsd_map[pidx];
2590		next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
2591		prefetch(&txq->ift_sds.ifsd_flags[next]);
2592	}
2593
2594
2595	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2596		desc_tag = txq->ift_tso_desc_tag;
2597		max_segs = scctx->isc_tx_tso_segments_max;
2598	} else {
2599		desc_tag = txq->ift_desc_tag;
2600		max_segs = scctx->isc_tx_nsegments;
2601	}
2602	m_head = *m_headp;
2603	bzero(&pi, sizeof(pi));
2604	pi.ipi_len = m_head->m_pkthdr.len;
2605	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
2606	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
2607	pi.ipi_vtag = (m_head->m_flags & M_VLANTAG) ? m_head->m_pkthdr.ether_vtag : 0;
2608	pi.ipi_pidx = pidx;
2609	pi.ipi_qsidx = txq->ift_id;
2610
2611	/* deliberate bitwise OR to make one condition */
2612	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
2613		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0))
2614			return (err);
2615		m_head = *m_headp;
2616	}
2617
2618retry:
2619	err = iflib_busdma_load_mbuf_sg(txq, desc_tag, map, m_headp, segs, &nsegs, max_segs, BUS_DMA_NOWAIT);
2620defrag:
2621	if (__predict_false(err)) {
2622		switch (err) {
2623		case EFBIG:
2624			/* try collapse once and defrag once */
2625			if (remap == 0)
2626				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
2627			if (remap == 1)
2628				m_head = m_defrag(*m_headp, M_NOWAIT);
2629			remap++;
2630			if (__predict_false(m_head == NULL))
2631				goto defrag_failed;
2632			txq->ift_mbuf_defrag++;
2633			*m_headp = m_head;
2634			goto retry;
2635			break;
2636		case ENOMEM:
2637			txq->ift_no_tx_dma_setup++;
2638			break;
2639		default:
2640			txq->ift_no_tx_dma_setup++;
2641			m_freem(*m_headp);
2642			DBG_COUNTER_INC(tx_frees);
2643			*m_headp = NULL;
2644			break;
2645		}
2646		txq->ift_map_failed++;
2647		DBG_COUNTER_INC(encap_load_mbuf_fail);
2648		return (err);
2649	}
2650
2651	/*
2652	 * XXX assumes a 1 to 1 relationship between segments and
2653	 *        descriptors - this does not hold true on all drivers, e.g.
2654	 *        cxgb
2655	 */
2656	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
2657		txq->ift_no_desc_avail++;
2658		if (map != NULL)
2659			bus_dmamap_unload(desc_tag, map);
2660		DBG_COUNTER_INC(encap_txq_avail_fail);
2661		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
2662			GROUPTASK_ENQUEUE(&txq->ift_task);
2663		return (ENOBUFS);
2664	}
2665	pi.ipi_segs = segs;
2666	pi.ipi_nsegs = nsegs;
2667
2668	MPASS(pidx >= 0 && pidx < txq->ift_size);
2669#ifdef PKT_DEBUG
2670	print_pkt(&pi);
2671#endif
2672	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
2673		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
2674						BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2675
2676		DBG_COUNTER_INC(tx_encap);
2677		MPASS(pi.ipi_new_pidx >= 0 &&
2678		    pi.ipi_new_pidx < txq->ift_size);
2679
2680		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
2681		if (pi.ipi_new_pidx < pi.ipi_pidx) {
2682			ndesc += txq->ift_size;
2683			txq->ift_gen = 1;
2684		}
2685		MPASS(pi.ipi_new_pidx != pidx);
2686		MPASS(ndesc > 0);
2687		txq->ift_in_use += ndesc;
2688		/*
2689		 * We update the last software descriptor again here because there may
2690		 * be a sentinel and/or there may be more mbufs than segments
2691		 */
2692		txq->ift_pidx = pi.ipi_new_pidx;
2693		txq->ift_npending += pi.ipi_ndescs;
2694	} else if (__predict_false(err == EFBIG && remap < 2)) {
2695		*m_headp = m_head = iflib_remove_mbuf(txq);
2696		remap = 1;
2697		txq->ift_txd_encap_efbig++;
2698		goto defrag;
2699	} else
2700		DBG_COUNTER_INC(encap_txd_encap_fail);
2701	return (err);
2702
2703defrag_failed:
2704	txq->ift_mbuf_defrag_failed++;
2705	txq->ift_map_failed++;
2706	m_freem(*m_headp);
2707	DBG_COUNTER_INC(tx_frees);
2708	*m_headp = NULL;
2709	return (ENOMEM);
2710}
2711
2712/* forward compatibility for cxgb */
2713#define FIRST_QSET(ctx) 0
2714
2715#define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
2716#define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
2717#define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
2718#define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
2719#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
2720#define MAX_TX_DESC(ctx) ((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max)
2721
2722
2723
2724/* If there are more than TXQ_MIN_OCCUPANCY packets pending we consider deferring
2725 * doorbell writes.
2726 *
2727 * ORing with 2 ensures that min occupancy is never less than 2 without any conditional logic.
2728 */
2729#define TXQ_MIN_OCCUPANCY(size) (((size) >> 6) | 0x2)
2730
2731static inline int
2732iflib_txq_min_occupancy(iflib_txq_t txq)
2733{
2734	if_ctx_t ctx;
2735
2736	ctx = txq->ift_ctx;
2737	return (get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx,
2738	    txq->ift_gen) < TXQ_MIN_OCCUPANCY(txq->ift_size) +
2739	    MAX_TX_DESC(ctx));
2740}
2741
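/*
 * Release the resources associated with the next "n" completed TX
 * descriptors starting at the consumer index: unload any mapped busdma
 * maps and free the mbufs recorded at encap time, then advance ift_cidx.
 */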
2742static void
2743iflib_tx_desc_free(iflib_txq_t txq, int n)
2744{
2745	int hasmap;
2746	uint32_t qsize, cidx, mask, gen;
2747	struct mbuf *m, **ifsd_m;
2748	uint8_t *ifsd_flags;
2749	bus_dmamap_t *ifsd_map;
2750
2751	cidx = txq->ift_cidx;
2752	gen = txq->ift_gen;
2753	qsize = txq->ift_size;
2754	mask = qsize-1;
2755	hasmap = txq->ift_sds.ifsd_map != NULL;
2756	ifsd_flags = txq->ift_sds.ifsd_flags;
2757	ifsd_m = txq->ift_sds.ifsd_m;
2758	ifsd_map = txq->ift_sds.ifsd_map;
2759
2760	while (n--) {
2761		prefetch(ifsd_m[(cidx + 3) & mask]);
2762		prefetch(ifsd_m[(cidx + 4) & mask]);
2763
2764		if (ifsd_m[cidx] != NULL) {
2765			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
2766			prefetch(&ifsd_flags[(cidx + CACHE_PTR_INCREMENT) & mask]);
2767			if (hasmap && (ifsd_flags[cidx] & TX_SW_DESC_MAPPED)) {
2768				/*
2769				 * does it matter if it's not the TSO tag? If so we'll
2770				 * have to add the type to flags
2771				 */
2772				bus_dmamap_unload(txq->ift_desc_tag, ifsd_map[cidx]);
2773				ifsd_flags[cidx] &= ~TX_SW_DESC_MAPPED;
2774			}
2775			if ((m = ifsd_m[cidx]) != NULL) {
2776				/* XXX we don't support any drivers that batch packets yet */
2777				MPASS(m->m_nextpkt == NULL);
2778
2779				m_free(m);
2780				ifsd_m[cidx] = NULL;
2781#if MEMORY_LOGGING
2782				txq->ift_dequeued++;
2783#endif
2784				DBG_COUNTER_INC(tx_frees);
2785			}
2786		}
2787		if (__predict_false(++cidx == qsize)) {
2788			cidx = 0;
2789			gen = 0;
2790		}
2791	}
2792	txq->ift_cidx = cidx;
2793	txq->ift_gen = gen;
2794}
2795
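/*
 * Ask the driver how many descriptors have completed and, once the
 * reclaimable count exceeds the supplied threshold, free the associated
 * mbufs and maps and return the number of descriptors given back to the
 * ring.  Returns 0 when not enough work has accumulated yet.
 */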
2796static __inline int
2797iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
2798{
2799	int reclaim;
2800	if_ctx_t ctx = txq->ift_ctx;
2801
2802	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
2803	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
2804
2805	/*
2806	 * Need a rate-limiting check so that this isn't called every time
2807	 */
2808	iflib_tx_credits_update(ctx, txq);
2809	reclaim = DESC_RECLAIMABLE(txq);
2810
2811	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
2812#ifdef INVARIANTS
2813		if (iflib_verbose_debug) {
2814			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
2815			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
2816			       reclaim, thresh);
2817
2818		}
2819#endif
2820		return (0);
2821	}
2822	iflib_tx_desc_free(txq, reclaim);
2823	txq->ift_cleaned += reclaim;
2824	txq->ift_in_use -= reclaim;
2825
2826	if (txq->ift_active == FALSE)
2827		txq->ift_active = TRUE;
2828
2829	return (reclaim);
2830}
2831
2832static struct mbuf **
2833_ring_peek_one(struct ifmp_ring *r, int cidx, int offset)
2834{
2835
2836	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (r->size-1)]));
2837}
2838
2839static void
2840iflib_txq_check_drain(iflib_txq_t txq, int budget)
2841{
2842
2843	ifmp_ring_check_drainage(txq->ift_br[0], budget);
2844}
2845
2846static uint32_t
2847iflib_txq_can_drain(struct ifmp_ring *r)
2848{
2849	iflib_txq_t txq = r->cookie;
2850	if_ctx_t ctx = txq->ift_ctx;
2851
2852	return ((TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx)) ||
2853		ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, false));
2854}
2855
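/*
 * mp_ring drain callback: encapsulate up to TX_BATCH_SIZE packets from the
 * ring, ringing the doorbell as descriptors are consumed and stopping
 * early when descriptors run short.  When the queue is being flushed the
 * packets are simply freed; the return value is the number of ring
 * entries consumed.
 */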
2856static uint32_t
2857iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
2858{
2859	iflib_txq_t txq = r->cookie;
2860	if_ctx_t ctx = txq->ift_ctx;
2861	if_t ifp = ctx->ifc_ifp;
2862	struct mbuf **mp, *m;
2863	int i, count, consumed, pkt_sent, bytes_sent, mcast_sent, avail, err, in_use_prev, desc_used;
2864
2865	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
2866			    !LINK_ACTIVE(ctx))) {
2867		DBG_COUNTER_INC(txq_drain_notready);
2868		return (0);
2869	}
2870
2871	avail = IDXDIFF(pidx, cidx, r->size);
2872	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
2873		DBG_COUNTER_INC(txq_drain_flushing);
2874		for (i = 0; i < avail; i++) {
2875			m_free(r->items[(cidx + i) & (r->size-1)]);
2876			r->items[(cidx + i) & (r->size-1)] = NULL;
2877		}
2878		return (avail);
2879	}
2880	iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
2881	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
2882		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
2883		CALLOUT_LOCK(txq);
2884		callout_stop(&txq->ift_timer);
2885		callout_stop(&txq->ift_db_check);
2886		CALLOUT_UNLOCK(txq);
2887		DBG_COUNTER_INC(txq_drain_oactive);
2888		return (0);
2889	}
2890	consumed = mcast_sent = bytes_sent = pkt_sent = 0;
2891	count = MIN(avail, TX_BATCH_SIZE);
2892
2893	for (desc_used = i = 0; i < count && TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2; i++) {
2894		mp = _ring_peek_one(r, cidx, i);
2895		in_use_prev = txq->ift_in_use;
2896		err = iflib_encap(txq, mp);
2897		/*
2898		 * What other errors should we bail out for?
2899		 */
2900		if (err == ENOBUFS) {
2901			DBG_COUNTER_INC(txq_drain_encapfail);
2902			break;
2903		}
2904		consumed++;
2905		if (err)
2906			continue;
2907
2908		pkt_sent++;
2909		m = *mp;
2910		DBG_COUNTER_INC(tx_sent);
2911		bytes_sent += m->m_pkthdr.len;
2912		if (m->m_flags & M_MCAST)
2913			mcast_sent++;
2914
2915		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
2916		desc_used += (txq->ift_in_use - in_use_prev);
2917		iflib_txd_db_check(ctx, txq, FALSE);
2918		ETHER_BPF_MTAP(ifp, m);
2919		if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
2920			break;
2921
2922		if (desc_used > TXQ_MAX_DB_CONSUMED(txq->ift_size))
2923			break;
2924	}
2925
2926	if ((iflib_min_tx_latency || iflib_txq_min_occupancy(txq)) && txq->ift_db_pending)
2927		iflib_txd_db_check(ctx, txq, TRUE);
2928	else if ((txq->ift_db_pending || TXQ_AVAIL(txq) < MAX_TX_DESC(ctx)) &&
2929		 (callout_pending(&txq->ift_db_check) == 0)) {
2930		txq->ift_db_pending_queued = txq->ift_db_pending;
2931		callout_reset_on(&txq->ift_db_check, 1, iflib_txd_deferred_db_check,
2932				 txq, txq->ift_db_check.c_cpu);
2933	}
2934	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
2935	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
2936	if (mcast_sent)
2937		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
2938
2939	return (consumed);
2940}
2941
2942static void
2943_task_fn_tx(void *context)
2944{
2945	iflib_txq_t txq = context;
2946	if_ctx_t ctx = txq->ift_ctx;
2947
2948	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
2949		return;
2950	ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE);
2951}
2952
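/*
 * RX grouptask handler: process a small budget of packets and then either
 * re-enable the queue interrupt (or the legacy interrupt) or, if more work
 * remains, reschedule the task.
 */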
2953static void
2954_task_fn_rx(void *context)
2955{
2956	iflib_rxq_t rxq = context;
2957	if_ctx_t ctx = rxq->ifr_ctx;
2958	bool more;
2959	int rc;
2960
2961	DBG_COUNTER_INC(task_fn_rxs);
2962	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
2963		return;
2964
2965	if ((more = iflib_rxeof(rxq, 16 /* XXX */)) == false) {
2966		if (ctx->ifc_flags & IFC_LEGACY)
2967			IFDI_INTR_ENABLE(ctx);
2968		else {
2969			DBG_COUNTER_INC(rx_intr_enables);
2970			rc = IFDI_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
2971			KASSERT(rc != ENOTSUP, ("MSI-X support requires queue_intr_enable, but not implemented in driver"));
2972		}
2973	}
2974	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
2975		return;
2976	if (more)
2977		GROUPTASK_ENQUEUE(&rxq->ifr_task);
2978}
2979
2980static void
2981_task_fn_admin(void *context)
2982{
2983	if_ctx_t ctx = context;
2984	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2985	iflib_txq_t txq;
2986	int i;
2987
2988	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
2989		return;
2990
2991	CTX_LOCK(ctx);
2992	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
2993		CALLOUT_LOCK(txq);
2994		callout_stop(&txq->ift_timer);
2995		CALLOUT_UNLOCK(txq);
2996	}
2997	IFDI_UPDATE_ADMIN_STATUS(ctx);
2998	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
2999		callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, txq->ift_timer.c_cpu);
3000	IFDI_LINK_INTR_ENABLE(ctx);
3001	CTX_UNLOCK(ctx);
3002
3003	if (LINK_ACTIVE(ctx) == 0)
3004		return;
3005	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
3006		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
3007}
3008
3009
3010static void
3011_task_fn_iov(void *context)
3012{
3013	if_ctx_t ctx = context;
3014
3015	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
3016		return;
3017
3018	CTX_LOCK(ctx);
3019	IFDI_VFLR_HANDLE(ctx);
3020	CTX_UNLOCK(ctx);
3021}
3022
3023static int
3024iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
3025{
3026	int err;
3027	if_int_delay_info_t info;
3028	if_ctx_t ctx;
3029
3030	info = (if_int_delay_info_t)arg1;
3031	ctx = info->iidi_ctx;
3032	info->iidi_req = req;
3033	info->iidi_oidp = oidp;
3034	CTX_LOCK(ctx);
3035	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
3036	CTX_UNLOCK(ctx);
3037	return (err);
3038}
3039
3040/*********************************************************************
3041 *
3042 *  IFNET FUNCTIONS
3043 *
3044 **********************************************************************/
3045
3046static void
3047iflib_if_init_locked(if_ctx_t ctx)
3048{
3049	iflib_stop(ctx);
3050	iflib_init_locked(ctx);
3051}
3052
3053
3054static void
3055iflib_if_init(void *arg)
3056{
3057	if_ctx_t ctx = arg;
3058
3059	CTX_LOCK(ctx);
3060	iflib_if_init_locked(ctx);
3061	CTX_UNLOCK(ctx);
3062}
3063
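/*
 * if_transmit entry point: select a TX queue from the packet's flowid,
 * enqueue the mbuf on that queue's mp_ring and let the drain callback (or
 * the TX grouptask) do the actual encapsulation.  On enqueue failure the
 * task is scheduled, ring drainage is checked and the packet is freed.
 */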
3064static int
3065iflib_if_transmit(if_t ifp, struct mbuf *m)
3066{
3067	if_ctx_t	ctx = if_getsoftc(ifp);
3068
3069	iflib_txq_t txq;
3070	int err, qidx;
3071
3072	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
3073		DBG_COUNTER_INC(tx_frees);
3074		m_freem(m);
3075		return (0);
3076	}
3077
3078	MPASS(m->m_nextpkt == NULL);
3079	qidx = 0;
3080	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m))
3081		qidx = QIDX(ctx, m);
3082	/*
3083	 * XXX calculate buf_ring based on flowid (divvy up bits?)
3084	 */
3085	txq = &ctx->ifc_txqs[qidx];
3086
3087#ifdef DRIVER_BACKPRESSURE
3088	if (txq->ift_closed) {
3089		while (m != NULL) {
3090			next = m->m_nextpkt;
3091			m->m_nextpkt = NULL;
3092			m_freem(m);
3093			m = next;
3094		}
3095		return (ENOBUFS);
3096	}
3097#endif
3098#ifdef notyet
3099	qidx = count = 0;
3100	mp = marr;
3101	next = m;
3102	do {
3103		count++;
3104		next = next->m_nextpkt;
3105	} while (next != NULL);
3106
3107	if (count > nitems(marr))
3108		if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
3109			/* XXX check nextpkt */
3110			m_freem(m);
3111			/* XXX simplify for now */
3112			DBG_COUNTER_INC(tx_frees);
3113			return (ENOBUFS);
3114		}
3115	for (next = m, i = 0; next != NULL; i++) {
3116		mp[i] = next;
3117		next = next->m_nextpkt;
3118		mp[i]->m_nextpkt = NULL;
3119	}
3120#endif
3121	DBG_COUNTER_INC(tx_seen);
3122	err = ifmp_ring_enqueue(txq->ift_br[0], (void **)&m, 1, TX_BATCH_SIZE);
3123
3124	if (err) {
3125		GROUPTASK_ENQUEUE(&txq->ift_task);
3126		/* driver backpressure support forthcoming */
3127#ifdef DRIVER_BACKPRESSURE
3128		txq->ift_closed = TRUE;
3129#endif
3130		ifmp_ring_check_drainage(txq->ift_br[0], TX_BATCH_SIZE);
3131		m_freem(m);
3132	} else if (TXQ_AVAIL(txq) < (txq->ift_size >> 1)) {
3133		GROUPTASK_ENQUEUE(&txq->ift_task);
3134	}
3135
3136	return (err);
3137}
3138
3139static void
3140iflib_if_qflush(if_t ifp)
3141{
3142	if_ctx_t ctx = if_getsoftc(ifp);
3143	iflib_txq_t txq = ctx->ifc_txqs;
3144	int i;
3145
3146	CTX_LOCK(ctx);
3147	ctx->ifc_flags |= IFC_QFLUSH;
3148	CTX_UNLOCK(ctx);
3149	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
3150		while (!(ifmp_ring_is_idle(txq->ift_br[0]) || ifmp_ring_is_stalled(txq->ift_br[0])))
3151			iflib_txq_check_drain(txq, 0);
3152	CTX_LOCK(ctx);
3153	ctx->ifc_flags &= ~IFC_QFLUSH;
3154	CTX_UNLOCK(ctx);
3155
3156	if_qflush(ifp);
3157}
3158
3159
3160#define IFCAP_FLAGS (IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
3161		     IFCAP_TSO4 | IFCAP_TSO6 | IFCAP_VLAN_HWTAGGING |	\
3162		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | IFCAP_VLAN_HWTSO)
3163
3164#define IFCAP_REINIT IFCAP_FLAGS
3165
3166static int
3167iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
3168{
3169	if_ctx_t ctx = if_getsoftc(ifp);
3170	struct ifreq	*ifr = (struct ifreq *)data;
3171#if defined(INET) || defined(INET6)
3172	struct ifaddr	*ifa = (struct ifaddr *)data;
3173#endif
3174	bool		avoid_reset = FALSE;
3175	int		err = 0, reinit = 0, bits;
3176
3177	switch (command) {
3178	case SIOCSIFADDR:
3179#ifdef INET
3180		if (ifa->ifa_addr->sa_family == AF_INET)
3181			avoid_reset = TRUE;
3182#endif
3183#ifdef INET6
3184		if (ifa->ifa_addr->sa_family == AF_INET6)
3185			avoid_reset = TRUE;
3186#endif
3187		/*
3188		** Calling init results in link renegotiation,
3189		** so we avoid doing it when possible.
3190		*/
3191		if (avoid_reset) {
3192			if_setflagbits(ifp, IFF_UP,0);
3193			if (!(if_getdrvflags(ifp)& IFF_DRV_RUNNING))
3194				reinit = 1;
3195#ifdef INET
3196			if (!(if_getflags(ifp) & IFF_NOARP))
3197				arp_ifinit(ifp, ifa);
3198#endif
3199		} else
3200			err = ether_ioctl(ifp, command, data);
3201		break;
3202	case SIOCSIFMTU:
3203		CTX_LOCK(ctx);
3204		if (ifr->ifr_mtu == if_getmtu(ifp)) {
3205			CTX_UNLOCK(ctx);
3206			break;
3207		}
3208		bits = if_getdrvflags(ifp);
3209		/* stop the driver and free any clusters before proceeding */
3210		iflib_stop(ctx);
3211
3212		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
3213			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
3214				ctx->ifc_flags |= IFC_MULTISEG;
3215			else
3216				ctx->ifc_flags &= ~IFC_MULTISEG;
3217			err = if_setmtu(ifp, ifr->ifr_mtu);
3218		}
3219		iflib_init_locked(ctx);
3220		if_setdrvflags(ifp, bits);
3221		CTX_UNLOCK(ctx);
3222		break;
3223	case SIOCSIFFLAGS:
3224		CTX_LOCK(ctx);
3225		if (if_getflags(ifp) & IFF_UP) {
3226			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3227				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
3228				    (IFF_PROMISC | IFF_ALLMULTI)) {
3229					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
3230				}
3231			} else
3232				reinit = 1;
3233		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3234			iflib_stop(ctx);
3235		}
3236		ctx->ifc_if_flags = if_getflags(ifp);
3237		CTX_UNLOCK(ctx);
3238		break;
3239
3241	case SIOCADDMULTI:
3242	case SIOCDELMULTI:
3243		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3244			CTX_LOCK(ctx);
3245			IFDI_INTR_DISABLE(ctx);
3246			IFDI_MULTI_SET(ctx);
3247			IFDI_INTR_ENABLE(ctx);
3248			CTX_UNLOCK(ctx);
3249		}
3250		break;
3251	case SIOCSIFMEDIA:
3252		CTX_LOCK(ctx);
3253		IFDI_MEDIA_SET(ctx);
3254		CTX_UNLOCK(ctx);
3255		/* falls thru */
3256	case SIOCGIFMEDIA:
3257		err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command);
3258		break;
3259	case SIOCGI2C:
3260	{
3261		struct ifi2creq i2c;
3262
3263		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
3264		if (err != 0)
3265			break;
3266		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
3267			err = EINVAL;
3268			break;
3269		}
3270		if (i2c.len > sizeof(i2c.data)) {
3271			err = EINVAL;
3272			break;
3273		}
3274
3275		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
3276			err = copyout(&i2c, ifr_data_get_ptr(ifr),
3277			    sizeof(i2c));
3278		break;
3279	}
3280	case SIOCSIFCAP:
3281	{
3282		int mask, setmask;
3283
3284		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
3285		setmask = 0;
3286#ifdef TCP_OFFLOAD
3287		setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
3288#endif
3289		setmask |= (mask & IFCAP_FLAGS);
3290
3291		if ((mask & IFCAP_WOL) &&
3292		    (if_getcapabilities(ifp) & IFCAP_WOL) != 0)
3293			setmask |= (mask & (IFCAP_WOL_MCAST|IFCAP_WOL_MAGIC));
3294		if_vlancap(ifp);
3295		/*
3296		 * want to ensure that traffic has stopped before we change any of the flags
3297		 */
3298		if (setmask) {
3299			CTX_LOCK(ctx);
3300			bits = if_getdrvflags(ifp);
3301			if (setmask & IFCAP_REINIT)
3302				iflib_stop(ctx);
3303			if_togglecapenable(ifp, setmask);
3304			if (setmask & IFCAP_REINIT)
3305				iflib_init_locked(ctx);
3306			if_setdrvflags(ifp, bits);
3307			CTX_UNLOCK(ctx);
3308		}
3309		break;
3310	    }
3311	case SIOCGPRIVATE_0:
3312	case SIOCSDRVSPEC:
3313	case SIOCGDRVSPEC:
3314		CTX_LOCK(ctx);
3315		err = IFDI_PRIV_IOCTL(ctx, command, data);
3316		CTX_UNLOCK(ctx);
3317		break;
3318	default:
3319		err = ether_ioctl(ifp, command, data);
3320		break;
3321	}
3322	if (reinit)
3323		iflib_if_init(ctx);
3324	return (err);
3325}
3326
3327static uint64_t
3328iflib_if_get_counter(if_t ifp, ift_counter cnt)
3329{
3330	if_ctx_t ctx = if_getsoftc(ifp);
3331
3332	return (IFDI_GET_COUNTER(ctx, cnt));
3333}
3334
3335/*********************************************************************
3336 *
3337 *  OTHER FUNCTIONS EXPORTED TO THE STACK
3338 *
3339 **********************************************************************/
3340
3341static void
3342iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
3343{
3344	if_ctx_t ctx = if_getsoftc(ifp);
3345
3346	if ((void *)ctx != arg)
3347		return;
3348
3349	if ((vtag == 0) || (vtag > 4095))
3350		return;
3351
3352	CTX_LOCK(ctx);
3353	IFDI_VLAN_REGISTER(ctx, vtag);
3354	/* Re-init to load the changes */
3355	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
3356		iflib_init_locked(ctx);
3357	CTX_UNLOCK(ctx);
3358}
3359
3360static void
3361iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
3362{
3363	if_ctx_t ctx = if_getsoftc(ifp);
3364
3365	if ((void *)ctx != arg)
3366		return;
3367
3368	if ((vtag == 0) || (vtag > 4095))
3369		return;
3370
3371	CTX_LOCK(ctx);
3372	IFDI_VLAN_UNREGISTER(ctx, vtag);
3373	/* Re-init to load the changes */
3374	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
3375		iflib_init_locked(ctx);
3376	CTX_UNLOCK(ctx);
3377}
3378
3379static void
3380iflib_led_func(void *arg, int onoff)
3381{
3382	if_ctx_t ctx = arg;
3383
3384	CTX_LOCK(ctx);
3385	IFDI_LED_FUNC(ctx, onoff);
3386	CTX_UNLOCK(ctx);
3387}
3388
3389/*********************************************************************
3390 *
3391 *  BUS FUNCTION DEFINITIONS
3392 *
3393 **********************************************************************/
3394
3395int
3396iflib_device_probe(device_t dev)
3397{
3398	pci_vendor_info_t *ent;
3399
3400	uint16_t	pci_vendor_id, pci_device_id;
3401	uint16_t	pci_subvendor_id, pci_subdevice_id;
3402	uint16_t	pci_rev_id;
3403	if_shared_ctx_t sctx;
3404
3405	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
3406		return (ENOTSUP);
3407
3408	pci_vendor_id = pci_get_vendor(dev);
3409	pci_device_id = pci_get_device(dev);
3410	pci_subvendor_id = pci_get_subvendor(dev);
3411	pci_subdevice_id = pci_get_subdevice(dev);
3412	pci_rev_id = pci_get_revid(dev);
3413	if (sctx->isc_parse_devinfo != NULL)
3414		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
3415
3416	ent = sctx->isc_vendor_info;
3417	while (ent->pvi_vendor_id != 0) {
3418		if (pci_vendor_id != ent->pvi_vendor_id) {
3419			ent++;
3420			continue;
3421		}
3422		if ((pci_device_id == ent->pvi_device_id) &&
3423		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
3424		     (ent->pvi_subvendor_id == 0)) &&
3425		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
3426		     (ent->pvi_subdevice_id == 0)) &&
3427		    ((pci_rev_id == ent->pvi_rev_id) ||
3428		     (ent->pvi_rev_id == 0))) {
3429
3430			device_set_desc_copy(dev, ent->pvi_name);
3431			/* this needs to be changed to zero if the bus probing code
3432			 * ever stops re-probing on best match because the sctx
3433			 * may have its values overwritten by register calls
3434			 * in subsequent probes
3435			 */
3436			return (BUS_PROBE_DEFAULT);
3437		}
3438		ent++;
3439	}
3440	return (ENXIO);
3441}
3442
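/*
 * Core attach path shared by all iflib drivers: allocate the context and
 * (optionally) the softc, register the ifnet, clamp the requested queue
 * and descriptor counts against the driver's limits, run IFDI_ATTACH_PRE,
 * set up MSI-X/MSI/legacy interrupts, allocate the queue state, attach the
 * ethernet interface and netmap support, and finish with IFDI_ATTACH_POST.
 */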
3443int
3444iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
3445{
3446	int err, rid, msix, msix_bar;
3447	if_ctx_t ctx;
3448	if_t ifp;
3449	if_softc_ctx_t scctx;
3450	int i;
3451	uint16_t main_txq;
3452	uint16_t main_rxq;
3453
3454
3455	ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
3456
3457	if (sc == NULL) {
3458		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
3459		device_set_softc(dev, ctx);
3460		ctx->ifc_flags |= IFC_SC_ALLOCATED;
3461	}
3462
3463	ctx->ifc_sctx = sctx;
3464	ctx->ifc_dev = dev;
3465	ctx->ifc_txrx = *sctx->isc_txrx;
3466	ctx->ifc_softc = sc;
3467
3468	if ((err = iflib_register(ctx)) != 0) {
3469		device_printf(dev, "iflib_register failed %d\n", err);
3470		return (err);
3471	}
3472	iflib_add_device_sysctl_pre(ctx);
3473
3474	scctx = &ctx->ifc_softc_ctx;
3475	/*
3476	 * XXX sanity check that ntxd & nrxd are a power of 2
3477	 */
3478	if (ctx->ifc_sysctl_ntxqs != 0)
3479		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
3480	if (ctx->ifc_sysctl_nrxqs != 0)
3481		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
3482
3483	for (i = 0; i < sctx->isc_ntxqs; i++) {
3484		if (ctx->ifc_sysctl_ntxds[i] != 0)
3485			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
3486		else
3487			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
3488	}
3489
3490	for (i = 0; i < sctx->isc_nrxqs; i++) {
3491		if (ctx->ifc_sysctl_nrxds[i] != 0)
3492			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
3493		else
3494			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
3495	}
3496
3497	for (i = 0; i < sctx->isc_nrxqs; i++) {
3498		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
3499			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
3500				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
3501			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
3502		}
3503		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
3504			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
3505				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
3506			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
3507		}
3508	}
3509
3510	for (i = 0; i < sctx->isc_ntxqs; i++) {
3511		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
3512			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
3513				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
3514			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
3515		}
3516		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
3517			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
3518				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
3519			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
3520		}
3521	}
3522
3523	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
3524		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
3525		return (err);
3526	}
3527	if (scctx->isc_ntxqsets_max)
3528		scctx->isc_ntxqsets = min(scctx->isc_ntxqsets, scctx->isc_ntxqsets_max);
3529	if (scctx->isc_nrxqsets_max)
3530		scctx->isc_nrxqsets = min(scctx->isc_nrxqsets, scctx->isc_nrxqsets_max);
3531
3532#ifdef ACPI_DMAR
3533	if (dmar_get_dma_tag(device_get_parent(dev), dev) != NULL)
3534		ctx->ifc_flags |= IFC_DMAR;
3535#endif
3536
3537	msix_bar = scctx->isc_msix_bar;
3538
3539	ifp = ctx->ifc_ifp;
3540
3541	if(sctx->isc_flags & IFLIB_HAS_TXCQ)
3542		main_txq = 1;
3543	else
3544		main_txq = 0;
3545
3546	if(sctx->isc_flags & IFLIB_HAS_RXCQ)
3547		main_rxq = 1;
3548	else
3549		main_rxq = 0;
3550
3551	/* XXX change for per-queue sizes */
3552	device_printf(dev, "using %d tx descriptors and %d rx descriptors\n",
3553		      scctx->isc_ntxd[main_txq], scctx->isc_nrxd[main_rxq]);
3554	for (i = 0; i < sctx->isc_nrxqs; i++) {
3555		if (!powerof2(scctx->isc_nrxd[i])) {
3556			/* round down instead? */
3557			device_printf(dev, "# rx descriptors must be a power of 2\n");
3558			err = EINVAL;
3559			goto fail;
3560		}
3561	}
3562	for (i = 0; i < sctx->isc_ntxqs; i++) {
3563		if (!powerof2(scctx->isc_ntxd[i])) {
3564			device_printf(dev,
3565			    "# tx descriptors must be a power of 2\n");
3566			err = EINVAL;
3567			goto fail;
3568		}
3569	}
3570
3571	if (scctx->isc_tx_nsegments > scctx->isc_ntxd[main_txq] /
3572	    MAX_SINGLE_PACKET_FRACTION)
3573		scctx->isc_tx_nsegments = max(1, scctx->isc_ntxd[main_txq] /
3574		    MAX_SINGLE_PACKET_FRACTION);
3575	if (scctx->isc_tx_tso_segments_max > scctx->isc_ntxd[main_txq] /
3576	    MAX_SINGLE_PACKET_FRACTION)
3577		scctx->isc_tx_tso_segments_max = max(1,
3578		    scctx->isc_ntxd[main_txq] / MAX_SINGLE_PACKET_FRACTION);
3579
3580	/*
3581	 * Protect the stack against modern hardware
3582	 */
3583	if (scctx->isc_tx_tso_size_max > FREEBSD_TSO_SIZE_MAX)
3584		scctx->isc_tx_tso_size_max = FREEBSD_TSO_SIZE_MAX;
3585
3586	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
3587	ifp->if_hw_tsomaxsegcount = scctx->isc_tx_tso_segments_max;
3588	ifp->if_hw_tsomax = scctx->isc_tx_tso_size_max;
3589	ifp->if_hw_tsomaxsegsize = scctx->isc_tx_tso_segsize_max;
3590	if (scctx->isc_rss_table_size == 0)
3591		scctx->isc_rss_table_size = 64;
3592	scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
3593	/*
3594	** Now setup MSI or MSI/X, should
3595	** return us the number of supported
3596	** vectors. (Will be 1 for MSI)
3597	*/
3598	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
3599		msix = scctx->isc_vectors;
3600	} else if (scctx->isc_msix_bar != 0)
3601		msix = iflib_msix_init(ctx);
3602	else {
3603		scctx->isc_vectors = 1;
3604		scctx->isc_ntxqsets = 1;
3605		scctx->isc_nrxqsets = 1;
3606		scctx->isc_intr = IFLIB_INTR_LEGACY;
3607		msix = 0;
3608	}
3609	/* Get memory for the station queues */
3610	if ((err = iflib_queues_alloc(ctx))) {
3611		device_printf(dev, "Unable to allocate queue memory\n");
3612		goto fail;
3613	}
3614
3615	if ((err = iflib_qset_structures_setup(ctx))) {
3616		device_printf(dev, "qset structure setup failed %d\n", err);
3617		goto fail_queues;
3618	}
3619
3620	if (msix > 1 && (err = IFDI_MSIX_INTR_ASSIGN(ctx, msix)) != 0) {
3621		device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n", err);
3622		goto fail_intr_free;
3623	}
3624	if (msix <= 1) {
3625		rid = 0;
3626		if (scctx->isc_intr == IFLIB_INTR_MSI) {
3627			MPASS(msix == 1);
3628			rid = 1;
3629		}
3630		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
3631			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
3632			goto fail_intr_free;
3633		}
3634	}
3635	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
3636	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
3637		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
3638		goto fail_detach;
3639	}
3640	if ((err = iflib_netmap_attach(ctx))) {
3641		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
3642		goto fail_detach;
3643	}
3644	*ctxp = ctx;
3645
3646	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
3647	iflib_add_device_sysctl_post(ctx);
3648	return (0);
3649fail_detach:
3650	ether_ifdetach(ctx->ifc_ifp);
3651fail_intr_free:
3652	if (scctx->isc_intr == IFLIB_INTR_MSIX || scctx->isc_intr == IFLIB_INTR_MSI)
3653		pci_release_msi(ctx->ifc_dev);
3654fail_queues:
3655	/* XXX free queues */
3656fail:
3657	IFDI_DETACH(ctx);
3658	return (err);
3659}
3660
3661int
3662iflib_device_attach(device_t dev)
3663{
3664	if_ctx_t ctx;
3665	if_shared_ctx_t sctx;
3666
3667	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
3668		return (ENOTSUP);
3669
3670	pci_enable_busmaster(dev);
3671
3672	return (iflib_device_register(dev, NULL, sctx, &ctx));
3673}
3674
3675int
3676iflib_device_deregister(if_ctx_t ctx)
3677{
3678	if_t ifp = ctx->ifc_ifp;
3679	iflib_txq_t txq;
3680	iflib_rxq_t rxq;
3681	device_t dev = ctx->ifc_dev;
3682	int i;
3683	struct taskqgroup *tqg;
3684
3685	/* Make sure VLANS are not using driver */
3686	if (if_vlantrunkinuse(ifp)) {
3687		device_printf(dev,"Vlan in use, detach first\n");
3688		return (EBUSY);
3689	}
3690
3691	CTX_LOCK(ctx);
3692	ctx->ifc_in_detach = 1;
3693	iflib_stop(ctx);
3694	CTX_UNLOCK(ctx);
3695
3696	/* Unregister VLAN events */
3697	if (ctx->ifc_vlan_attach_event != NULL)
3698		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
3699	if (ctx->ifc_vlan_detach_event != NULL)
3700		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
3701
3702	iflib_netmap_detach(ifp);
3703	ether_ifdetach(ifp);
3704	/* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
3705	CTX_LOCK_DESTROY(ctx);
3706	if (ctx->ifc_led_dev != NULL)
3707		led_destroy(ctx->ifc_led_dev);
3708	/* XXX drain any dependent tasks */
3709	tqg = qgroup_softirq;
3710	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
3711		callout_drain(&txq->ift_timer);
3712		callout_drain(&txq->ift_db_check);
3713		if (txq->ift_task.gt_uniq != NULL)
3714			taskqgroup_detach(tqg, &txq->ift_task);
3715	}
3716	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
3717		if (rxq->ifr_task.gt_uniq != NULL)
3718			taskqgroup_detach(tqg, &rxq->ifr_task);
3719	}
3720	tqg = qgroup_if_config_tqg;
3721	if (ctx->ifc_admin_task.gt_uniq != NULL)
3722		taskqgroup_detach(tqg, &ctx->ifc_admin_task);
3723	if (ctx->ifc_vflr_task.gt_uniq != NULL)
3724		taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
3725
3726	IFDI_DETACH(ctx);
3727	device_set_softc(ctx->ifc_dev, NULL);
3728	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
3729		pci_release_msi(dev);
3730	}
3731	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
3732		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
3733	}
3734	if (ctx->ifc_msix_mem != NULL) {
3735		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
3736			ctx->ifc_softc_ctx.isc_msix_bar, ctx->ifc_msix_mem);
3737		ctx->ifc_msix_mem = NULL;
3738	}
3739
3740	bus_generic_detach(dev);
3741	if_free(ifp);
3742
3743	iflib_tx_structures_free(ctx);
3744	iflib_rx_structures_free(ctx);
3745	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
3746		free(ctx->ifc_softc, M_IFLIB);
3747	free(ctx, M_IFLIB);
3748	return (0);
3749}
3750
3751
3752int
3753iflib_device_detach(device_t dev)
3754{
3755	if_ctx_t ctx = device_get_softc(dev);
3756
3757	return (iflib_device_deregister(ctx));
3758}
3759
3760int
3761iflib_device_suspend(device_t dev)
3762{
3763	if_ctx_t ctx = device_get_softc(dev);
3764
3765	CTX_LOCK(ctx);
3766	IFDI_SUSPEND(ctx);
3767	CTX_UNLOCK(ctx);
3768
3769	return (bus_generic_suspend(dev));
3770}
3771int
3772iflib_device_shutdown(device_t dev)
3773{
3774	if_ctx_t ctx = device_get_softc(dev);
3775
3776	CTX_LOCK(ctx);
3777	IFDI_SHUTDOWN(ctx);
3778	CTX_UNLOCK(ctx);
3779
3780	return (bus_generic_suspend(dev));
3781}
3782
3783
3784int
3785iflib_device_resume(device_t dev)
3786{
3787	if_ctx_t ctx = device_get_softc(dev);
3788	iflib_txq_t txq = ctx->ifc_txqs;
3789
3790	CTX_LOCK(ctx);
3791	IFDI_RESUME(ctx);
3792	iflib_init_locked(ctx);
3793	CTX_UNLOCK(ctx);
3794	for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
3795		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
3796
3797	return (bus_generic_resume(dev));
3798}
3799
3800int
3801iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
3802{
3803	int error;
3804	if_ctx_t ctx = device_get_softc(dev);
3805
3806	CTX_LOCK(ctx);
3807	error = IFDI_IOV_INIT(ctx, num_vfs, params);
3808	CTX_UNLOCK(ctx);
3809
3810	return (error);
3811}
3812
3813void
3814iflib_device_iov_uninit(device_t dev)
3815{
3816	if_ctx_t ctx = device_get_softc(dev);
3817
3818	CTX_LOCK(ctx);
3819	IFDI_IOV_UNINIT(ctx);
3820	CTX_UNLOCK(ctx);
3821}
3822
3823int
3824iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
3825{
3826	int error;
3827	if_ctx_t ctx = device_get_softc(dev);
3828
3829	CTX_LOCK(ctx);
3830	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
3831	CTX_UNLOCK(ctx);
3832
3833	return (error);
3834}
3835
3836/*********************************************************************
3837 *
3838 *  MODULE FUNCTION DEFINITIONS
3839 *
3840 **********************************************************************/
3841
3842/*
3843 * - Start a fast taskqueue thread for each core
3844 * - Start a taskqueue for control operations
3845 */
3846static int
3847iflib_module_init(void)
3848{
3849	return (0);
3850}
3851
3852static int
3853iflib_module_event_handler(module_t mod, int what, void *arg)
3854{
3855	int err;
3856
3857	switch (what) {
3858	case MOD_LOAD:
3859		if ((err = iflib_module_init()) != 0)
3860			return (err);
3861		break;
3862	case MOD_UNLOAD:
3863		return (EBUSY);
3864	default:
3865		return (EOPNOTSUPP);
3866	}
3867
3868	return (0);
3869}
3870
3871/*********************************************************************
3872 *
3873 *  PUBLIC FUNCTION DEFINITIONS
3874 *     ordered as in iflib.h
3875 *
3876 **********************************************************************/
3877
3878
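/*
 * Sanity-check that the driver filled in the mandatory shared-context
 * fields and txrx methods before iflib builds state on top of them.
 */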
3879static void
3880_iflib_assert(if_shared_ctx_t sctx)
3881{
3882	MPASS(sctx->isc_tx_maxsize);
3883	MPASS(sctx->isc_tx_maxsegsize);
3884
3885	MPASS(sctx->isc_rx_maxsize);
3886	MPASS(sctx->isc_rx_nsegments);
3887	MPASS(sctx->isc_rx_maxsegsize);
3888
3889
3890	MPASS(sctx->isc_txrx->ift_txd_encap);
3891	MPASS(sctx->isc_txrx->ift_txd_flush);
3892	MPASS(sctx->isc_txrx->ift_txd_credits_update);
3893	MPASS(sctx->isc_txrx->ift_rxd_available);
3894	MPASS(sctx->isc_txrx->ift_rxd_pkt_get);
3895	MPASS(sctx->isc_txrx->ift_rxd_refill);
3896	MPASS(sctx->isc_txrx->ift_rxd_flush);
3897
3898	MPASS(sctx->isc_nrxd_min[0]);
3899	MPASS(sctx->isc_nrxd_max[0]);
3900	MPASS(sctx->isc_nrxd_default[0]);
3901	MPASS(sctx->isc_ntxd_min[0]);
3902	MPASS(sctx->isc_ntxd_max[0]);
3903	MPASS(sctx->isc_ntxd_default[0]);
3904}
3905
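/*
 * Allocate the ifnet, compile the driver's kobj method table, point the
 * ifnet methods at iflib's wrappers, and register the VLAN event
 * handlers and media hooks.
 */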
3906static int
3907iflib_register(if_ctx_t ctx)
3908{
3909	if_shared_ctx_t sctx = ctx->ifc_sctx;
3910	driver_t *driver = sctx->isc_driver;
3911	device_t dev = ctx->ifc_dev;
3912	if_t ifp;
3913
3914	_iflib_assert(sctx);
3915
3916	CTX_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
3917
3918	ifp = ctx->ifc_ifp = if_gethandle(IFT_ETHER);
3919	if (ifp == NULL) {
3920		device_printf(dev, "cannot allocate ifnet structure\n");
3921		return (ENOMEM);
3922	}
3923
3924	/*
3925	 * Initialize our context's device specific methods
3926	 */
3927	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
3928	kobj_class_compile((kobj_class_t) driver);
3929	driver->refs++;
3930
3931	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3932	if_setsoftc(ifp, ctx);
3933	if_setdev(ifp, dev);
3934	if_setinitfn(ifp, iflib_if_init);
3935	if_setioctlfn(ifp, iflib_if_ioctl);
3936	if_settransmitfn(ifp, iflib_if_transmit);
3937	if_setqflushfn(ifp, iflib_if_qflush);
3938	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
3939
3940	if_setcapabilities(ifp, 0);
3941	if_setcapenable(ifp, 0);
3942
3943	ctx->ifc_vlan_attach_event =
3944		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
3945							  EVENTHANDLER_PRI_FIRST);
3946	ctx->ifc_vlan_detach_event =
3947		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
3948							  EVENTHANDLER_PRI_FIRST);
3949
3950	ifmedia_init(&ctx->ifc_media, IFM_IMASK,
3951					 iflib_media_change, iflib_media_status);
3952
3953	return (0);
3954}
3955
3956
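/*
 * Allocate the software TX and RX queue state, the per-queue descriptor
 * DMA memory, free lists and transmit mp_rings, then pass the descriptor
 * ring addresses to the driver via IFDI_TX_QUEUES_ALLOC() and
 * IFDI_RX_QUEUES_ALLOC().
 */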
3957static int
3958iflib_queues_alloc(if_ctx_t ctx)
3959{
3960	if_shared_ctx_t sctx = ctx->ifc_sctx;
3961	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
3962	device_t dev = ctx->ifc_dev;
3963	int nrxqsets = scctx->isc_nrxqsets;
3964	int ntxqsets = scctx->isc_ntxqsets;
3965	iflib_txq_t txq;
3966	iflib_rxq_t rxq;
3967	iflib_fl_t fl = NULL;
3968	int i, j, cpu, err, txconf, rxconf;
3969	iflib_dma_info_t ifdip;
3970	uint32_t *rxqsizes = scctx->isc_rxqsizes;
3971	uint32_t *txqsizes = scctx->isc_txqsizes;
3972	uint8_t nrxqs = sctx->isc_nrxqs;
3973	uint8_t ntxqs = sctx->isc_ntxqs;
3974	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
3975	caddr_t *vaddrs;
3976	uint64_t *paddrs;
3977	struct ifmp_ring **brscp;
3978	int nbuf_rings = 1; /* XXX determine dynamically */
3979
3980	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
3981	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
3982
3983	brscp = NULL;
3984	txq = NULL;
3985	rxq = NULL;
3986
3987	/* Allocate the TX ring struct memory */
3988	if (!(txq =
3989	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
3990	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
3991		device_printf(dev, "Unable to allocate TX ring memory\n");
3992		err = ENOMEM;
3993		goto fail;
3994	}
3995
3996	/* Now allocate the RX */
3997	if (!(rxq =
3998	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
3999	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
4000		device_printf(dev, "Unable to allocate RX ring memory\n");
4001		err = ENOMEM;
4002		goto rx_fail;
4003	}
4004	if (!(brscp = malloc(sizeof(void *) * nbuf_rings * nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
4005		device_printf(dev, "Unable to allocate buf_ring_sc * memory\n");
4006		err = ENOMEM;
4007		goto rx_fail;
4008	}
4009
4010	ctx->ifc_txqs = txq;
4011	ctx->ifc_rxqs = rxq;
4012
4013	/*
4014	 * XXX handle allocation failure
4015	 */
4016	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
4017		/* Set up some basics */
4018
4019		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
4020			device_printf(dev, "failed to allocate iflib_dma_info\n");
4021			err = ENOMEM;
4022			goto err_tx_desc;
4023		}
4024		txq->ift_ifdi = ifdip;
4025		for (j = 0; j < ntxqs; j++, ifdip++) {
4026			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
4027				device_printf(dev, "Unable to allocate Descriptor memory\n");
4028				err = ENOMEM;
4029				goto err_tx_desc;
4030			}
4031			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
4032		}
4033		txq->ift_ctx = ctx;
4034		txq->ift_id = i;
4035		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
4036			txq->ift_br_offset = 1;
4037		} else {
4038			txq->ift_br_offset = 0;
4039		}
4040		/* XXX fix this */
4041		txq->ift_timer.c_cpu = cpu;
4042		txq->ift_db_check.c_cpu = cpu;
4043		txq->ift_nbr = nbuf_rings;
4044
4045		if (iflib_txsd_alloc(txq)) {
4046			device_printf(dev, "Critical Failure setting up TX buffers\n");
4047			err = ENOMEM;
4048			goto err_tx_desc;
4049		}
4050
4051		/* Initialize the TX lock */
4052		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:tx(%d):callout",
4053		    device_get_nameunit(dev), txq->ift_id);
4054		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
4055		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
4056		callout_init_mtx(&txq->ift_db_check, &txq->ift_mtx, 0);
4057
4058		snprintf(txq->ift_db_mtx_name, MTX_NAME_LEN, "%s:tx(%d):db",
4059			 device_get_nameunit(dev), txq->ift_id);
4060		TXDB_LOCK_INIT(txq);
4061
4062		txq->ift_br = brscp + i*nbuf_rings;
4063		for (j = 0; j < nbuf_rings; j++) {
4064			err = ifmp_ring_alloc(&txq->ift_br[j], 2048, txq, iflib_txq_drain,
4065					      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
4066			if (err) {
4067				/* XXX free any allocated rings */
4068				device_printf(dev, "Unable to allocate buf_ring\n");
4069				goto err_tx_desc;
4070			}
4071		}
4072	}
4073
4074	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
4075		/* Set up some basics */
4076
4077		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs, M_IFLIB, M_WAITOK|M_ZERO)) == NULL) {
4078			device_printf(dev, "failed to allocate iflib_dma_info\n");
4079			err = ENOMEM;
4080			goto err_tx_desc;
4081		}
4082
4083		rxq->ifr_ifdi = ifdip;
4084		for (j = 0; j < nrxqs; j++, ifdip++) {
4085			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, BUS_DMA_NOWAIT)) {
4086				device_printf(dev, "Unable to allocate Descriptor memory\n");
4087				err = ENOMEM;
4088				goto err_tx_desc;
4089			}
4090			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
4091		}
4092		rxq->ifr_ctx = ctx;
4093		rxq->ifr_id = i;
4094		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
4095			rxq->ifr_fl_offset = 1;
4096		} else {
4097			rxq->ifr_fl_offset = 0;
4098		}
4099		rxq->ifr_nfl = nfree_lists;
4100		if (!(fl =
4101			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
4102			device_printf(dev, "Unable to allocate free list memory\n");
4103			err = ENOMEM;
4104			goto err_tx_desc;
4105		}
4106		rxq->ifr_fl = fl;
4107		for (j = 0; j < nfree_lists; j++) {
4108			rxq->ifr_fl[j].ifl_rxq = rxq;
4109			rxq->ifr_fl[j].ifl_id = j;
4110			rxq->ifr_fl[j].ifl_ifdi =
4111			    &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
4112		}
4113		/* Allocate receive buffers for the ring */
4114		if (iflib_rxsd_alloc(rxq)) {
4115			device_printf(dev,
4116			    "Critical Failure setting up receive buffers\n");
4117			err = ENOMEM;
4118			goto err_rx_desc;
4119		}
4120	}
4121
4122	/* TXQs */
4123	vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
4124	paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
4125	for (i = 0; i < ntxqsets; i++) {
4126		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
4127
4128		for (j = 0; j < ntxqs; j++, di++) {
4129			vaddrs[i*ntxqs + j] = di->idi_vaddr;
4130			paddrs[i*ntxqs + j] = di->idi_paddr;
4131		}
4132	}
4133	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
4134		device_printf(ctx->ifc_dev, "device queue allocation failed\n");
4135		iflib_tx_structures_free(ctx);
4136		free(vaddrs, M_IFLIB);
4137		free(paddrs, M_IFLIB);
4138		goto err_rx_desc;
4139	}
4140	free(vaddrs, M_IFLIB);
4141	free(paddrs, M_IFLIB);
4142
4143	/* RXQs */
4144	vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
4145	paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
4146	for (i = 0; i < nrxqsets; i++) {
4147		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
4148
4149		for (j = 0; j < nrxqs; j++, di++) {
4150			vaddrs[i*nrxqs + j] = di->idi_vaddr;
4151			paddrs[i*nrxqs + j] = di->idi_paddr;
4152		}
4153	}
4154	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
4155		device_printf(ctx->ifc_dev, "device queue allocation failed\n");
4156		iflib_tx_structures_free(ctx);
4157		free(vaddrs, M_IFLIB);
4158		free(paddrs, M_IFLIB);
4159		goto err_rx_desc;
4160	}
4161	free(vaddrs, M_IFLIB);
4162	free(paddrs, M_IFLIB);
4163
4164	return (0);
4165
4166/* XXX handle allocation failure changes */
4167err_rx_desc:
4168err_tx_desc:
4169	if (ctx->ifc_rxqs != NULL)
4170		free(ctx->ifc_rxqs, M_IFLIB);
4171	ctx->ifc_rxqs = NULL;
4172	if (ctx->ifc_txqs != NULL)
4173		free(ctx->ifc_txqs, M_IFLIB);
4174	ctx->ifc_txqs = NULL;
4175rx_fail:
4176	if (brscp != NULL)
4177		free(brscp, M_IFLIB);
4178	if (rxq != NULL)
4179		free(rxq, M_IFLIB);
4180	if (txq != NULL)
4181		free(txq, M_IFLIB);
4182fail:
4183	return (err);
4184}
4185
4186static int
4187iflib_tx_structures_setup(if_ctx_t ctx)
4188{
4189	iflib_txq_t txq = ctx->ifc_txqs;
4190	int i;
4191
4192	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
4193		iflib_txq_setup(txq);
4194
4195	return (0);
4196}
4197
4198static void
4199iflib_tx_structures_free(if_ctx_t ctx)
4200{
4201	iflib_txq_t txq = ctx->ifc_txqs;
4202	int i, j;
4203
4204	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
4205		iflib_txq_destroy(txq);
4206		for (j = 0; j < ctx->ifc_nhwtxqs; j++)
4207			iflib_dma_free(&txq->ift_ifdi[j]);
4208	}
4209	free(ctx->ifc_txqs, M_IFLIB);
4210	ctx->ifc_txqs = NULL;
4211	IFDI_QUEUES_FREE(ctx);
4212}
4213
4214/*********************************************************************
4215 *
4216 *  Initialize all receive rings.
4217 *
4218 **********************************************************************/
4219static int
4220iflib_rx_structures_setup(if_ctx_t ctx)
4221{
4222	iflib_rxq_t rxq = ctx->ifc_rxqs;
4223	int q;
4224#if defined(INET6) || defined(INET)
4225	int i, err;
4226#endif
4227
4228	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
4229#if defined(INET6) || defined(INET)
4230		tcp_lro_free(&rxq->ifr_lc);
4231		if ((err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
4232		    TCP_LRO_ENTRIES, min(1024,
4233		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]))) != 0) {
4234			device_printf(ctx->ifc_dev, "LRO Initialization failed!\n");
4235			goto fail;
4236		}
4237		rxq->ifr_lro_enabled = TRUE;
4238#endif
4239		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
4240	}
4241	return (0);
4242#if defined(INET6) || defined(INET)
4243fail:
4244	/*
4245	 * Free the RX software descriptors allocated so far; we will only handle
4246	 * the rings that completed, since the failing case will have
4247	 * cleaned up after itself. 'q' failed, so it is the terminus.
4248	 */
4249	rxq = ctx->ifc_rxqs;
4250	for (i = 0; i < q; ++i, rxq++) {
4251		iflib_rx_sds_free(rxq);
4252		rxq->ifr_cq_gen = rxq->ifr_cq_cidx = rxq->ifr_cq_pidx = 0;
4253	}
4254	return (err);
4255#endif
4256}
4257
4258/*********************************************************************
4259 *
4260 *  Free all receive rings.
4261 *
4262 **********************************************************************/
4263static void
4264iflib_rx_structures_free(if_ctx_t ctx)
4265{
4266	iflib_rxq_t rxq = ctx->ifc_rxqs;
4267
4268	for (int i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
4269		iflib_rx_sds_free(rxq);
4270	}
4271}
4272
4273static int
4274iflib_qset_structures_setup(if_ctx_t ctx)
4275{
4276	int err;
4277
4278	if ((err = iflib_tx_structures_setup(ctx)) != 0)
4279		return (err);
4280
4281	if ((err = iflib_rx_structures_setup(ctx)) != 0) {
4282		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
4283		iflib_tx_structures_free(ctx);
4284		iflib_rx_structures_free(ctx);
4285	}
4286	return (err);
4287}
4288
4289int
4290iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
4291				driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, char *name)
4292{
4293
4294	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
4295}
4296
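/*
 * Leave 'cpus' with the qid'th CPU of the interrupt CPU set as its
 * lowest set bit, so the caller can bind the qid'th queue's task to it.
 */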
4297static void
4298find_nth(if_ctx_t ctx, cpuset_t *cpus, int qid)
4299{
4300	int i, cpuid;
4301
4302	CPU_COPY(&ctx->ifc_cpus, cpus);
4303	/* clear the qid lowest set bits so CPU_FFS() returns the qid'th CPU */
4304	for (i = 0; i < qid; i++) {
4305		cpuid = CPU_FFS(cpus);
4306		CPU_CLR(cpuid, cpus);
4307	}
4308}
4309
4310int
4311iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
4312						iflib_intr_type_t type, driver_filter_t *filter,
4313						void *filter_arg, int qid, char *name)
4314{
4315	struct grouptask *gtask;
4316	struct taskqgroup *tqg;
4317	iflib_filter_info_t info;
4318	cpuset_t cpus;
4319	gtask_fn_t *fn;
4320	int tqrid, err;
4321	void *q;
4322
4323	info = &ctx->ifc_filter_info;
4324
4325	switch (type) {
4326	/* XXX merge tx/rx for netmap? */
4327	case IFLIB_INTR_TX:
4328		q = &ctx->ifc_txqs[qid];
4329		info = &ctx->ifc_txqs[qid].ift_filter_info;
4330		gtask = &ctx->ifc_txqs[qid].ift_task;
4331		tqg = qgroup_softirq;
4332		tqrid = irq->ii_rid;
4333		fn = _task_fn_tx;
4334		break;
4335	case IFLIB_INTR_RX:
4336		q = &ctx->ifc_rxqs[qid];
4337		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
4338		gtask = &ctx->ifc_rxqs[qid].ifr_task;
4339		tqg = qgroup_softirq;
4340		tqrid = irq->ii_rid;
4341		fn = _task_fn_rx;
4342		break;
4343	case IFLIB_INTR_ADMIN:
4344		q = ctx;
4345		info = &ctx->ifc_filter_info;
4346		gtask = &ctx->ifc_admin_task;
4347		tqg = qgroup_if_config_tqg;
4348		tqrid = -1;
4349		fn = _task_fn_admin;
4350		break;
4351	default:
4352		panic("unknown net intr type");
4353	}
4354	GROUPTASK_INIT(gtask, 0, fn, q);
4355
4356	info->ifi_filter = filter;
4357	info->ifi_filter_arg = filter_arg;
4358	info->ifi_task = gtask;
4359
4360	/* XXX query cpu that rid belongs to */
4361
4362	err = _iflib_irq_alloc(ctx, irq, rid, iflib_fast_intr, NULL, info,  name);
4363	if (err != 0)
4364		return (err);
4365	if (tqrid != -1) {
4366		find_nth(ctx, &cpus, qid);
4367		taskqgroup_attach_cpu(tqg, gtask, q, CPU_FFS(&cpus), irq->ii_rid, name);
4368	} else
4369		taskqgroup_attach(tqg, gtask, q, tqrid, name);
4370
4371
4372	return (0);
4373}
4374
4375void
4376iflib_softirq_alloc_generic(if_ctx_t ctx, int rid, iflib_intr_type_t type,  void *arg, int qid, char *name)
4377{
4378	struct grouptask *gtask;
4379	struct taskqgroup *tqg;
4380	gtask_fn_t *fn;
4381	void *q;
4382
4383	switch (type) {
4384	case IFLIB_INTR_TX:
4385		q = &ctx->ifc_txqs[qid];
4386		gtask = &ctx->ifc_txqs[qid].ift_task;
4387		tqg = qgroup_softirq;
4388		fn = _task_fn_tx;
4389		break;
4390	case IFLIB_INTR_RX:
4391		q = &ctx->ifc_rxqs[qid];
4392		gtask = &ctx->ifc_rxqs[qid].ifr_task;
4393		tqg = qgroup_softirq;
4394		fn = _task_fn_rx;
4395		break;
4396	case IFLIB_INTR_ADMIN:
4397		q = ctx;
4398		gtask = &ctx->ifc_admin_task;
4399		tqg = qgroup_if_config_tqg;
4400		rid = -1;
4401		fn = _task_fn_admin;
4402		break;
4403	case IFLIB_INTR_IOV:
4404		q = ctx;
4405		gtask = &ctx->ifc_vflr_task;
4406		tqg = qgroup_if_config_tqg;
4407		rid = -1;
4408		fn = _task_fn_iov;
4409		break;
4410	default:
4411		panic("unknown net intr type");
4412	}
4413	GROUPTASK_INIT(gtask, 0, fn, q);
4414	taskqgroup_attach(tqg, gtask, q, rid, name);
4415}
4416
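/*
 * A minimal sketch (hypothetical driver/softc names, not taken from any
 * in-tree driver) of how a driver's IFDI_MSIX_INTR_ASSIGN method might
 * use the two allocation helpers above: one hardware vector per RX
 * queue, with TX completion handled as a deferred softirq task keyed to
 * the same queue index.
 *
 *	static int
 *	mydrv_if_msix_intr_assign(if_ctx_t ctx, int msix)
 *	{
 *		struct mydrv_softc *sc = iflib_get_softc(ctx);
 *		char name[16];
 *		int i, rid, err;
 *
 *		for (i = 0; i < sc->num_rx_queues; i++) {
 *			rid = i + 1;
 *			snprintf(name, sizeof(name), "rxq%d", i);
 *			err = iflib_irq_alloc_generic(ctx, &sc->rx_irqs[i],
 *			    rid, IFLIB_INTR_RX, mydrv_rxq_filter,
 *			    &sc->rx_queues[i], i, name);
 *			if (err != 0)
 *				return (err);
 *		}
 *		for (i = 0; i < sc->num_tx_queues; i++) {
 *			snprintf(name, sizeof(name), "txq%d", i);
 *			iflib_softirq_alloc_generic(ctx, i + 1,
 *			    IFLIB_INTR_TX, &sc->tx_queues[i], i, name);
 *		}
 *		return (0);
 *	}
 */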
4417void
4418iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
4419{
4420	if (irq->ii_tag)
4421		bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
4422
4423	if (irq->ii_res)
4424		bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ, irq->ii_rid, irq->ii_res);
4425}
4426
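/*
 * Set up a single shared legacy or MSI interrupt: the fast filter runs
 * against RX queue 0, with TX and admin work wired up as group tasks
 * that the filter can defer to.
 */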
4427static int
4428iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, char *name)
4429{
4430	iflib_txq_t txq = ctx->ifc_txqs;
4431	iflib_rxq_t rxq = ctx->ifc_rxqs;
4432	if_irq_t irq = &ctx->ifc_legacy_irq;
4433	iflib_filter_info_t info;
4434	struct grouptask *gtask;
4435	struct taskqgroup *tqg;
4436	gtask_fn_t *fn;
4437	int tqrid;
4438	void *q;
4439	int err;
4440
4441	q = &ctx->ifc_rxqs[0];
4442	info = &rxq[0].ifr_filter_info;
4443	gtask = &rxq[0].ifr_task;
4444	tqg = qgroup_softirq;
4445	tqrid = irq->ii_rid = *rid;
4446	fn = _task_fn_rx;
4447
4448	ctx->ifc_flags |= IFC_LEGACY;
4449	info->ifi_filter = filter;
4450	info->ifi_filter_arg = filter_arg;
4451	info->ifi_task = gtask;
4452
4453	/* We allocate a single interrupt resource */
4454	if ((err = _iflib_irq_alloc(ctx, irq, tqrid, iflib_fast_intr, NULL, info, name)) != 0)
4455		return (err);
4456	GROUPTASK_INIT(gtask, 0, fn, q);
4457	taskqgroup_attach(tqg, gtask, q, tqrid, name);
4458
4459	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
4460	taskqgroup_attach(qgroup_softirq, &txq->ift_task, txq, tqrid, "tx");
4461	GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
4462	taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx, -1, "admin/link");
4463
4464	return (0);
4465}
4466
4467void
4468iflib_led_create(if_ctx_t ctx)
4469{
4470
4471	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
4472								  device_get_nameunit(ctx->ifc_dev));
4473}
4474
4475void
4476iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
4477{
4478
4479	GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
4480}
4481
4482void
4483iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
4484{
4485
4486	GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
4487}
4488
4489void
4490iflib_admin_intr_deferred(if_ctx_t ctx)
4491{
4492
4493	GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
4494}
4495
4496void
4497iflib_iov_intr_deferred(if_ctx_t ctx)
4498{
4499
4500	GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
4501}
4502
4503void
4504iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name)
4505{
4506
4507	taskqgroup_attach_cpu(qgroup_softirq, gt, uniq, cpu, -1, name);
4508}
4509
4510void
4511iflib_config_gtask_init(if_ctx_t ctx, struct grouptask *gtask, gtask_fn_t *fn,
4512	char *name)
4513{
4514
4515	GROUPTASK_INIT(gtask, 0, fn, ctx);
4516	taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, -1, name);
4517}
4518
4519void
4520iflib_config_gtask_deinit(struct grouptask *gtask)
4521{
4522
4523	taskqgroup_detach(qgroup_if_config_tqg, gtask);
4524}
4525
4526void
4527iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
4528{
4529	if_t ifp = ctx->ifc_ifp;
4530	iflib_txq_t txq = ctx->ifc_txqs;
4531
4532
4533	if_setbaudrate(ifp, baudrate);
4534
4535	/* If link down, disable watchdog */
4536	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
4537		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
4538			txq->ift_qstatus = IFLIB_QUEUE_IDLE;
4539	}
4540	ctx->ifc_link_state = link_state;
4541	if_link_state_change(ifp, link_state);
4542}
4543
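/*
 * Ask the driver how many TX descriptors the hardware has completed and
 * advance the queue's processed consumer index accordingly.
 */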
4544static int
4545iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
4546{
4547	int credits;
4548
4549	if (ctx->isc_txd_credits_update == NULL)
4550		return (0);
4551
4552	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, txq->ift_cidx_processed, true)) == 0)
4553		return (0);
4554
4555	txq->ift_processed += credits;
4556	txq->ift_cidx_processed += credits;
4557
4558	if (txq->ift_cidx_processed >= txq->ift_size)
4559		txq->ift_cidx_processed -= txq->ift_size;
4560	return (credits);
4561}
4562
4563static int
4564iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, int cidx, int budget)
4565{
4566
4567	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
4568	    budget));
4569}
4570
4571void
4572iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
4573	const char *description, if_int_delay_info_t info,
4574	int offset, int value)
4575{
4576	info->iidi_ctx = ctx;
4577	info->iidi_offset = offset;
4578	info->iidi_value = value;
4579	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
4580	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
4581	    OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW,
4582	    info, 0, iflib_sysctl_int_delay, "I", description);
4583}
4584
4585struct mtx *
4586iflib_ctx_lock_get(if_ctx_t ctx)
4587{
4588
4589	return (&ctx->ifc_mtx);
4590}
4591
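/*
 * Work out how many MSI-X vectors to request: enable bus mastering and
 * the MSI-X ENABLE bit, map the MSI-X table BAR, then size the queue
 * count from the available messages, the interrupt CPU set and the
 * tunables, falling back to MSI or a legacy interrupt on failure.
 */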
4592static int
4593iflib_msix_init(if_ctx_t ctx)
4594{
4595	device_t dev = ctx->ifc_dev;
4596	if_shared_ctx_t sctx = ctx->ifc_sctx;
4597	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4598	int vectors, queues, rx_queues, tx_queues, queuemsgs, msgs;
4599	int iflib_num_tx_queues, iflib_num_rx_queues;
4600	int err, admincnt, bar;
4601
4602	iflib_num_tx_queues = scctx->isc_ntxqsets;
4603	iflib_num_rx_queues = scctx->isc_nrxqsets;
4604
4605	bar = ctx->ifc_softc_ctx.isc_msix_bar;
4606	admincnt = sctx->isc_admin_intrcnt;
4607	/* Override by tunable */
4608	if (enable_msix == 0)
4609		goto msi;
4610
4611	/*
4612	** When used in a virtualized environment,
4613	** the PCI BUSMASTER capability may not be set,
4614	** so explicitly set it here and rewrite
4615	** the ENABLE bit in the MSI-X control register
4616	** at this point to allow the host to
4617	** successfully initialize us.
4618	*/
4619	{
4620		uint16_t pci_cmd_word;
4621		int msix_ctrl, rid;
4622
4623		rid = 0;
4624		pci_cmd_word = pci_read_config(dev, PCIR_COMMAND, 2);
4625		pci_cmd_word |= PCIM_CMD_BUSMASTEREN;
4626		pci_write_config(dev, PCIR_COMMAND, pci_cmd_word, 2);
4627		pci_find_cap(dev, PCIY_MSIX, &rid);
4628		rid += PCIR_MSIX_CTRL;
4629		msix_ctrl = pci_read_config(dev, rid, 2);
4630		msix_ctrl |= PCIM_MSIXCTRL_MSIX_ENABLE;
4631		pci_write_config(dev, rid, msix_ctrl, 2);
4632	}
4633
4634	/*
4635	 * bar == -1 => "trust me I know what I'm doing"
4636	 * Some drivers are for hardware that is so shoddily
4637	 * documented that no one knows which BARs are which,
4638	 * so the developer has to map all of them. This hack
4639	 * allows such hardware to use MSI-X in this framework.
4640	 */
4641	if (bar != -1) {
4642		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
4643	            SYS_RES_MEMORY, &bar, RF_ACTIVE);
4644		if (ctx->ifc_msix_mem == NULL) {
4645			/* May not be enabled */
4646			device_printf(dev, "Unable to map MSI-X table\n");
4647			goto msi;
4648		}
4649	}
4650	/* First try MSI/X */
4651	if ((msgs = pci_msix_count(dev)) == 0) { /* system has msix disabled */
4652		device_printf(dev, "System has MSI-X disabled\n");
4653		bus_release_resource(dev, SYS_RES_MEMORY,
4654		    bar, ctx->ifc_msix_mem);
4655		ctx->ifc_msix_mem = NULL;
4656		goto msi;
4657	}
4658#if IFLIB_DEBUG
4659	/* use only 1 qset in debug mode */
4660	queuemsgs = min(msgs - admincnt, 1);
4661#else
4662	queuemsgs = msgs - admincnt;
4663#endif
4664	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) == 0) {
4665#ifdef RSS
4666		queues = imin(queuemsgs, rss_getnumbuckets());
4667#else
4668		queues = queuemsgs;
4669#endif
4670		queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
4671		device_printf(dev, "pxm cpus: %d queue msgs: %d admincnt: %d\n",
4672					  CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
4673	} else {
4674		device_printf(dev, "Unable to fetch CPU list\n");
4675		/* Figure out a reasonable auto config value */
4676		queues = min(queuemsgs, mp_ncpus);
4677	}
4678#ifdef  RSS
4679	/* If we're doing RSS, clamp at the number of RSS buckets */
4680	if (queues > rss_getnumbuckets())
4681		queues = rss_getnumbuckets();
4682#endif
4683	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
4684		rx_queues = iflib_num_rx_queues;
4685	else
4686		rx_queues = queues;
4687	/*
4688	 * We want this to be all logical CPUs by default
4689	 */
4690	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
4691		tx_queues = iflib_num_tx_queues;
4692	else
4693		tx_queues = mp_ncpus;
4694
4695	if (ctx->ifc_sysctl_qs_eq_override == 0) {
4696#ifdef INVARIANTS
4697		if (tx_queues != rx_queues)
4698			device_printf(dev, "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
4699				      min(rx_queues, tx_queues), min(rx_queues, tx_queues));
4700#endif
4701		tx_queues = min(rx_queues, tx_queues);
4702		rx_queues = min(rx_queues, tx_queues);
4703	}
4704
4705	device_printf(dev, "using %d rx queues %d tx queues \n", rx_queues, tx_queues);
4706
4707	vectors = rx_queues + admincnt;
4708	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
4709		device_printf(dev,
4710					  "Using MSIX interrupts with %d vectors\n", vectors);
4711		scctx->isc_vectors = vectors;
4712		scctx->isc_nrxqsets = rx_queues;
4713		scctx->isc_ntxqsets = tx_queues;
4714		scctx->isc_intr = IFLIB_INTR_MSIX;
4715
4716		return (vectors);
4717	} else {
4718		device_printf(dev, "failed to allocate %d msix vectors, err: %d - using MSI\n", vectors, err);
4719	}
4720msi:
4721	vectors = pci_msi_count(dev);
4722	scctx->isc_nrxqsets = 1;
4723	scctx->isc_ntxqsets = 1;
4724	scctx->isc_vectors = vectors;
4725	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
4726		device_printf(dev,"Using an MSI interrupt\n");
4727		scctx->isc_intr = IFLIB_INTR_MSI;
4728	} else {
4729		device_printf(dev,"Using a Legacy interrupt\n");
4730		scctx->isc_intr = IFLIB_INTR_LEGACY;
4731	}
4732
4733	return (vectors);
4734}
4735
4736char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
4737
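/*
 * Sysctl handler that renders an mp_ring's producer/consumer indices and
 * state word as a human-readable string.
 */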
4738static int
4739mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
4740{
4741	int rc;
4742	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
4743	struct sbuf *sb;
4744	char *ring_state = "UNKNOWN";
4745
4746	/* XXX needed ? */
4747	rc = sysctl_wire_old_buffer(req, 0);
4748	MPASS(rc == 0);
4749	if (rc != 0)
4750		return (rc);
4751	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
4752	MPASS(sb != NULL);
4753	if (sb == NULL)
4754		return (ENOMEM);
4755	if (state[3] <= 3)
4756		ring_state = ring_states[state[3]];
4757
4758	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
4759		    state[0], state[1], state[2], ring_state);
4760	rc = sbuf_finish(sb);
4761	sbuf_delete(sb);
4762	return (rc);
4763}
4764
4765enum iflib_ndesc_handler {
4766	IFLIB_NTXD_HANDLER,
4767	IFLIB_NRXD_HANDLER,
4768};
4769
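/*
 * Sysctl handler for the override_ntxds/override_nrxds tunables: it
 * prints the per-queue descriptor counts as a comma-separated list and
 * parses the same format on a write.
 */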
4770static int
4771mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
4772{
4773	if_ctx_t ctx = (void *)arg1;
4774	enum iflib_ndesc_handler type = arg2;
4775	char buf[256] = {0};
4776	uint16_t *ndesc;
4777	char *p, *next;
4778	int nqs, rc, i;
4779
4780	MPASS(type == IFLIB_NTXD_HANDLER || type == IFLIB_NRXD_HANDLER);
4781
4782	nqs = 8;
4783	switch(type) {
4784	case IFLIB_NTXD_HANDLER:
4785		ndesc = ctx->ifc_sysctl_ntxds;
4786		if (ctx->ifc_sctx)
4787			nqs = ctx->ifc_sctx->isc_ntxqs;
4788		break;
4789	case IFLIB_NRXD_HANDLER:
4790		ndesc = ctx->ifc_sysctl_nrxds;
4791		if (ctx->ifc_sctx)
4792			nqs = ctx->ifc_sctx->isc_nrxqs;
4793		break;
4794	}
4795	if (nqs == 0)
4796		nqs = 8;
4797
4798	for (i = 0; i < 8; i++) {
4799		if (i >= nqs)
4800			break;
4801		if (i)
4802			strcat(buf, ",");
4803		sprintf(strchr(buf, 0), "%d", ndesc[i]);
4804	}
4805
4806	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
4807	if (rc || req->newptr == NULL)
4808		return (rc);
4809
4810	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
4811	    i++, p = strsep(&next, " ,")) {
4812		ndesc[i] = strtoul(p, NULL, 10);
4813	}
4814
4815	return (rc);
4816}
4817
4818#define NAME_BUFLEN 32
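/*
 * Create the per-device "iflib" sysctl node and the tunables that must
 * be evaluated before queues are allocated (queue and descriptor
 * counts).
 */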
4819static void
4820iflib_add_device_sysctl_pre(if_ctx_t ctx)
4821{
4822	device_t dev = iflib_get_dev(ctx);
4823	struct sysctl_oid_list *child, *oid_list;
4824	struct sysctl_ctx_list *ctx_list;
4825	struct sysctl_oid *node;
4826
4827	ctx_list = device_get_sysctl_ctx(dev);
4828	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4829	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
4830						      CTLFLAG_RD, NULL, "IFLIB fields");
4831	oid_list = SYSCTL_CHILDREN(node);
4832
4833	SYSCTL_ADD_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
4834		       CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, 0,
4835		       "driver version");
4836
4837	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
4838		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
4839			"# of txqs to use, 0 => use default #");
4840	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
4841		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
4842			"# of rxqs to use, 0 => use default #");
4843	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
4844		       CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
4845                       "permit #txq != #rxq");
4846
4847	/* XXX change for per-queue sizes */
4848	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
4849		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER,
4850                       mp_ndesc_handler, "A",
4851                       "list of # of tx descriptors to use, 0 = use default #");
4852	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
4853		       CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER,
4854                       mp_ndesc_handler, "A",
4855                       "list of # of rx descriptors to use, 0 = use default #");
4856}
4857
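/*
 * Populate the per-device "iflib" sysctl node with per-TX-queue,
 * per-RX-queue and per-free-list statistics once the queues have been
 * allocated.
 */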
4858static void
4859iflib_add_device_sysctl_post(if_ctx_t ctx)
4860{
4861	if_shared_ctx_t sctx = ctx->ifc_sctx;
4862	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4863	device_t dev = iflib_get_dev(ctx);
4864	struct sysctl_oid_list *child;
4865	struct sysctl_ctx_list *ctx_list;
4866	iflib_fl_t fl;
4867	iflib_txq_t txq;
4868	iflib_rxq_t rxq;
4869	int i, j;
4870	char namebuf[NAME_BUFLEN];
4871	char *qfmt;
4872	struct sysctl_oid *queue_node, *fl_node, *node;
4873	struct sysctl_oid_list *queue_list, *fl_list;
4874	ctx_list = device_get_sysctl_ctx(dev);
4875
4876	node = ctx->ifc_sysctl_node;
4877	child = SYSCTL_CHILDREN(node);
4878
4879	if (scctx->isc_ntxqsets > 100)
4880		qfmt = "txq%03d";
4881	else if (scctx->isc_ntxqsets > 10)
4882		qfmt = "txq%02d";
4883	else
4884		qfmt = "txq%d";
4885	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
4886		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
4887		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
4888					     CTLFLAG_RD, NULL, "Queue Name");
4889		queue_list = SYSCTL_CHILDREN(queue_node);
4890#if MEMORY_LOGGING
4891		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
4892				CTLFLAG_RD,
4893				&txq->ift_dequeued, "total mbufs freed");
4894		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
4895				CTLFLAG_RD,
4896				&txq->ift_enqueued, "total mbufs enqueued");
4897#endif
4898		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
4899				   CTLFLAG_RD,
4900				   &txq->ift_mbuf_defrag, "# of times m_defrag was called");
4901		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
4902				   CTLFLAG_RD,
4903				   &txq->ift_pullups, "# of times m_pullup was called");
4904		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
4905				   CTLFLAG_RD,
4906				   &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
4907		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
4908				   CTLFLAG_RD,
4909				   &txq->ift_no_desc_avail, "# of times no descriptors were available");
4910		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
4911				   CTLFLAG_RD,
4912				   &txq->ift_map_failed, "# of times dma map failed");
4913		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
4914				   CTLFLAG_RD,
4915				   &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
4916		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
4917				   CTLFLAG_RD,
4918				   &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
4919		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
4920				   CTLFLAG_RD,
4921				   &txq->ift_pidx, 1, "Producer Index");
4922		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
4923				   CTLFLAG_RD,
4924				   &txq->ift_cidx, 1, "Consumer Index");
4925		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
4926				   CTLFLAG_RD,
4927				   &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
4928		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
4929				   CTLFLAG_RD,
4930				   &txq->ift_in_use, 1, "descriptors in use");
4931		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
4932				   CTLFLAG_RD,
4933				   &txq->ift_processed, "descriptors processed for clean");
4934		SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
4935				   CTLFLAG_RD,
4936				   &txq->ift_cleaned, "total cleaned");
4937		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
4938				CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br[0]->state),
4939				0, mp_ring_state_handler, "A", "soft ring state");
4940		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
4941				       CTLFLAG_RD, &txq->ift_br[0]->enqueues,
4942				       "# of enqueues to the mp_ring for this queue");
4943		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
4944				       CTLFLAG_RD, &txq->ift_br[0]->drops,
4945				       "# of drops in the mp_ring for this queue");
4946		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
4947				       CTLFLAG_RD, &txq->ift_br[0]->starts,
4948				       "# of normal consumer starts in the mp_ring for this queue");
4949		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
4950				       CTLFLAG_RD, &txq->ift_br[0]->stalls,
4951					       "# of consumer stalls in the mp_ring for this queue");
4952		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
4953			       CTLFLAG_RD, &txq->ift_br[0]->restarts,
4954				       "# of consumer restarts in the mp_ring for this queue");
4955		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
4956				       CTLFLAG_RD, &txq->ift_br[0]->abdications,
4957				       "# of consumer abdications in the mp_ring for this queue");
4958
4959	}
4960
4961	if (scctx->isc_nrxqsets > 100)
4962		qfmt = "rxq%03d";
4963	else if (scctx->isc_nrxqsets > 10)
4964		qfmt = "rxq%02d";
4965	else
4966		qfmt = "rxq%d";
4967	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
4968		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
4969		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
4970					     CTLFLAG_RD, NULL, "Queue Name");
4971		queue_list = SYSCTL_CHILDREN(queue_node);
4972		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
4973			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_pidx",
4974				       CTLFLAG_RD,
4975				       &rxq->ifr_cq_pidx, 1, "Producer Index");
4976			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
4977				       CTLFLAG_RD,
4978				       &rxq->ifr_cq_cidx, 1, "Consumer Index");
4979		}
4980		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
4981			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
4982			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
4983						     CTLFLAG_RD, NULL, "freelist Name");
4984			fl_list = SYSCTL_CHILDREN(fl_node);
4985			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
4986				       CTLFLAG_RD,
4987				       &fl->ifl_pidx, 1, "Producer Index");
4988			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
4989				       CTLFLAG_RD,
4990				       &fl->ifl_cidx, 1, "Consumer Index");
4991			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
4992				       CTLFLAG_RD,
4993				       &fl->ifl_credits, 1, "credits available");
4994#if MEMORY_LOGGING
4995			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
4996					CTLFLAG_RD,
4997					&fl->ifl_m_enqueued, "mbufs allocated");
4998			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
4999					CTLFLAG_RD,
5000					&fl->ifl_m_dequeued, "mbufs freed");
5001			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
5002					CTLFLAG_RD,
5003					&fl->ifl_cl_enqueued, "clusters allocated");
5004			SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
5005					CTLFLAG_RD,
5006					&fl->ifl_cl_dequeued, "clusters freed");
5007#endif
5008
5009		}
5010	}
5011
5012}
5013