/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
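
/*
 * Elsewhere in the driver these knobs are typically fetched as
 * hw.mxge.* loader tunables.  A hypothetical /boot/loader.conf
 * snippet (knob names inferred from the variables above, not quoted
 * from mxge(4)):
 *
 *   hw.mxge.intr_coal_delay=30
 *   hw.mxge.max_slices=4
 *   hw.mxge.flow_control=1
 */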

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;


	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

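/*
 * Illustrative only: how the two helpers above pair up.  A minimal,
 * hypothetical sketch ("example_dma_usage" is not part of the
 * driver), kept out of the build:
 */
#if 0
static int
example_dma_usage(mxge_softc_t *sc)
{
	mxge_dma_t example_dma;
	int err;

	/* one 4KB-aligned, physically contiguous 4KB block */
	err = mxge_dma_alloc(sc, &example_dma, 4096, 4096);
	if (err != 0)
		return err;
	/* the NIC DMAs via example_dma.bus_addr, the CPU uses
	   example_dma.addr; tear both down when done */
	mxge_dma_free(&example_dma);
	return 0;
}
#endif
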
281/*
282 * The eeprom strings on the lanaiX have the format
283 * SN=x\0
284 * MAC=x:x:x:x:x:x\0
285 * PC=text\0
286 */
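/*
 * For example (values hypothetical), the raw block might hold
 *   "SN=YK1234\0MAC=00:60:dd:47:ab:cd\0PC=example-code\0\0"
 * with an empty string terminating the list.
 */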

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
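	/*
	 * Worked example with hypothetical numbers: if cmd.data0
	 * came back as 0x00640080, then 0x64 (100) transfers of len
	 * bytes completed in 0x80 (128) half-microsecond ticks.  The
	 * rate is (100 * len) bytes / (128 * 0.5) us, which is
	 * exactly the (count * len * 2) / ticks expression used for
	 * sc->read_dma and friends below.
	 */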

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL)
		goto abort_with_zs;
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
			       0, "number of bad csums preventing LRO");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints+=3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints+=3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	wmb();
}

static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->ip_hlen = pi->ip->ip_hl << 2;
		if (!tso)
			return 0;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		nxt = 0;
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
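	/*
	 * Worked example (hypothetical sizes): with a 14-byte
	 * Ethernet header, 20-byte IP header and 20-byte TCP header,
	 * cksum_offset is 34 and cum_len starts at -54; cum_len only
	 * becomes non-negative once the send loop below has walked
	 * past the entire 54-byte header.
	 */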
1876
1877	/* TSO implies checksum offload on this hardware */
1878	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1879		/*
1880		 * If packet has full TCP csum, replace it with pseudo hdr
1881		 * sum that the NIC expects, otherwise the NIC will emit
1882		 * packets with bad TCP checksums.
1883		 */
1884		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1885		if (pi->ip6) {
1886#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1887			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1888			sum = in6_cksum_pseudo(pi->ip6,
1889			    m->m_pkthdr.len - cksum_offset,
1890			    IPPROTO_TCP, 0);
1891#endif
1892		} else {
1893#ifdef INET
1894			m->m_pkthdr.csum_flags |= CSUM_TCP;
1895			sum = in_pseudo(pi->ip->ip_src.s_addr,
1896			    pi->ip->ip_dst.s_addr,
1897			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1898				    cksum_offset)));
1899#endif
1900		}
1901		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1902		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1903	}
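	/*
	 * Added note: the pseudo header sum stored above covers only the
	 * IP addresses, the protocol and the TCP length; the NIC folds
	 * the payload into it to produce the final th_sum of every
	 * segment it emits.
	 */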
1904	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1905
1906
1907	/* for TSO, pseudo_hdr_offset holds mss.
1908	 * The firmware figures out where to put
1909	 * the checksum by parsing the header. */
1910	pseudo_hdr_offset = htobe16(mss);
1911
1912	if (pi->ip6) {
1913		/*
1914		 * for IPv6 TSO, the "checksum offset" is re-purposed
1915		 * to store the TCP header len
1916		 */
1917		cksum_offset = (pi->tcp->th_off << 2);
1918	}
1919
1920	tx = &ss->tx;
1921	req = tx->req_list;
1922	seg = tx->seg_list;
1923	cnt = 0;
1924	rdma_count = 0;
1925	/* "rdma_count" is the number of RDMAs belonging to the
1926	 * current packet BEFORE the current send request. For
1927	 * non-TSO packets, this is equal to "count".
1928	 * For TSO packets, rdma_count needs to be reset
1929	 * to 0 after a segment cut.
1930	 *
1931	 * The rdma_count field of the send request is
1932	 * the number of RDMAs of the packet starting at
1933	 * that request. For TSO send requests with one or more cuts
1934	 * in the middle, this is the number of RDMAs starting
1935	 * after the last cut in the request. All previous
1936	 * segments before the last cut implicitly have 1 RDMA.
1937	 *
1938	 * Since the number of RDMAs is not known beforehand,
1939	 * it must be filled-in retroactively - after each
1940	 * segmentation cut or at the end of the entire packet.
1941	 */
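	/*
	 * Worked example (added commentary): when a request both ends a
	 * TSO segment (chop == 1) and the next request starts a new one
	 * (next_is_first == 1), rdma_count |= -(1 | 1) pins rdma_count
	 * at -1 and the rdma_count++ at the bottom of the loop brings it
	 * back to 0, restarting the count after the cut, while
	 * (req - rdma_count)->rdma_count retroactively patches the
	 * descriptor that began the previous run.
	 */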
1942
1943	while (busdma_seg_cnt) {
1944		/* Break the busdma segment up into pieces */
1945		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1946		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1947		len = seg->ds_len;
1948
1949		while (len) {
1950			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1951			seglen = len;
1952			cum_len_next = cum_len + seglen;
1953			(req-rdma_count)->rdma_count = rdma_count + 1;
1954			if (__predict_true(cum_len >= 0)) {
1955				/* payload */
1956				chop = (cum_len_next > mss);
1957				cum_len_next = cum_len_next % mss;
1958				next_is_first = (cum_len_next == 0);
1959				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1960				flags_next |= next_is_first *
1961					MXGEFW_FLAGS_FIRST;
1962				rdma_count |= -(chop | next_is_first);
1963				rdma_count += chop & !next_is_first;
1964			} else if (cum_len_next >= 0) {
1965				/* header ends */
1966				rdma_count = -1;
1967				cum_len_next = 0;
1968				seglen = -cum_len;
1969				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1970				flags_next = MXGEFW_FLAGS_TSO_PLD |
1971					MXGEFW_FLAGS_FIRST |
1972					(small * MXGEFW_FLAGS_SMALL);
1973			}
1974
1975			req->addr_high = high_swapped;
1976			req->addr_low = htobe32(low);
1977			req->pseudo_hdr_offset = pseudo_hdr_offset;
1978			req->pad = 0;
1979			req->rdma_count = 1;
1980			req->length = htobe16(seglen);
1981			req->cksum_offset = cksum_offset;
1982			req->flags = flags | ((cum_len & 1) *
1983					      MXGEFW_FLAGS_ALIGN_ODD);
1984			low += seglen;
1985			len -= seglen;
1986			cum_len = cum_len_next;
1987			flags = flags_next;
1988			req++;
1989			cnt++;
1990			rdma_count++;
1991			if (cksum_offset != 0 && !pi->ip6) {
1992				if (__predict_false(cksum_offset > seglen))
1993					cksum_offset -= seglen;
1994				else
1995					cksum_offset = 0;
1996			}
1997			if (__predict_false(cnt > tx->max_desc))
1998				goto drop;
1999		}
2000		busdma_seg_cnt--;
2001		seg++;
2002	}
2003	(req-rdma_count)->rdma_count = rdma_count;
2004
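	/*
	 * Added note: walk backwards from the final descriptor, tagging
	 * MXGEFW_FLAGS_TSO_LAST, until the descriptor that chops into or
	 * begins the last on-the-wire segment has been tagged as well.
	 */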
2005	do {
2006		req--;
2007		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2008	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2009
2010	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2011	mxge_submit_req(tx, tx->req_list, cnt);
2012#ifdef IFNET_BUF_RING
2013	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2014		/* tell the NIC to start polling this slice */
2015		*tx->send_go = 1;
2016		tx->queue_active = 1;
2017		tx->activate++;
2018		wmb();
2019	}
2020#endif
2021	return;
2022
2023drop:
2024	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2025	m_freem(m);
2026	ss->oerrors++;
2027	if (!once) {
2028		printf("tx->max_desc exceeded via TSO!\n");
2029		printf("mss = %d, %ld, %d!\n", mss,
2030		       (long)seg - (long)tx->seg_list, tx->max_desc);
2031		once = 1;
2032	}
2033	return;
2034
2035}
2036
2037#endif /* IFCAP_TSO4 */
2038
2039#ifdef MXGE_NEW_VLAN_API
2040/*
2041 * We reproduce the software vlan tag insertion from
2042 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2043 * vlan tag insertion. We need to advertise this in order to have the
2044 * vlan interface respect our csum offload flags.
2045 */
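/*
 * Illustrative layout (added): the 4-byte 802.1Q tag is spliced in
 * right after the MAC addresses, turning
 *	dst[6] src[6] type[2]
 * into
 *	dst[6] src[6] 0x8100 tag[2] type[2]
 * which is what the routine below builds in place.
 */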
2046static struct mbuf *
2047mxge_vlan_tag_insert(struct mbuf *m)
2048{
2049	struct ether_vlan_header *evl;
2050
2051	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2052	if (__predict_false(m == NULL))
2053		return NULL;
2054	if (m->m_len < sizeof(*evl)) {
2055		m = m_pullup(m, sizeof(*evl));
2056		if (__predict_false(m == NULL))
2057			return NULL;
2058	}
2059	/*
2060	 * Transform the Ethernet header into an Ethernet header
2061	 * with 802.1Q encapsulation.
2062	 */
2063	evl = mtod(m, struct ether_vlan_header *);
2064	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2065	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2066	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2067	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2068	m->m_flags &= ~M_VLANTAG;
2069	return m;
2070}
2071#endif /* MXGE_NEW_VLAN_API */
2072
2073static void
2074mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2075{
2076	struct mxge_pkt_info pi = {0,0,0,0};
2077	mxge_softc_t *sc;
2078	mcp_kreq_ether_send_t *req;
2079	bus_dma_segment_t *seg;
2080	struct mbuf *m_tmp;
2081	struct ifnet *ifp;
2082	mxge_tx_ring_t *tx;
2083	int cnt, cum_len, err, i, idx, odd_flag;
2084	uint16_t pseudo_hdr_offset;
2085	uint8_t flags, cksum_offset;
2086
2087
2088	sc = ss->sc;
2089	ifp = sc->ifp;
2090	tx = &ss->tx;
2091
2092#ifdef MXGE_NEW_VLAN_API
2093	if (m->m_flags & M_VLANTAG) {
2094		m = mxge_vlan_tag_insert(m);
2095		if (__predict_false(m == NULL))
2096			goto drop_without_m;
2097	}
2098#endif
2099	if (m->m_pkthdr.csum_flags &
2100	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2101		if (mxge_parse_tx(ss, m, &pi))
2102			goto drop;
2103	}
2104
2105	/* (try to) map the frame for DMA */
2106	idx = tx->req & tx->mask;
2107	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2108				      m, tx->seg_list, &cnt,
2109				      BUS_DMA_NOWAIT);
2110	if (__predict_false(err == EFBIG)) {
2111		/* Too many segments in the chain.  Try
2112		   to defrag */
2113		m_tmp = m_defrag(m, M_NOWAIT);
2114		if (m_tmp == NULL) {
2115			goto drop;
2116		}
2117		ss->tx.defrag++;
2118		m = m_tmp;
2119		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2120					      tx->info[idx].map,
2121					      m, tx->seg_list, &cnt,
2122					      BUS_DMA_NOWAIT);
2123	}
2124	if (__predict_false(err != 0)) {
2125		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2126			      " packet len = %d\n", err, m->m_pkthdr.len);
2127		goto drop;
2128	}
2129	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2130			BUS_DMASYNC_PREWRITE);
2131	tx->info[idx].m = m;
2132
2133#if IFCAP_TSO4
2134	/* TSO is different enough that we handle it in another routine */
2135	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2136		mxge_encap_tso(ss, m, cnt, &pi);
2137		return;
2138	}
2139#endif
2140
2141	req = tx->req_list;
2142	cksum_offset = 0;
2143	pseudo_hdr_offset = 0;
2144	flags = MXGEFW_FLAGS_NO_TSO;
2145
2146	/* checksum offloading? */
2147	if (m->m_pkthdr.csum_flags &
2148	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2149		/* the headers were already parsed by mxge_parse_tx() above;
2150		   point the firmware at the L4 header and checksum field */
2151		cksum_offset = pi.ip_off + pi.ip_hlen;
2152		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2153		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2154		req->cksum_offset = cksum_offset;
2155		flags |= MXGEFW_FLAGS_CKSUM;
2156		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2157	} else {
2158		odd_flag = 0;
2159	}
2160	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2161		flags |= MXGEFW_FLAGS_SMALL;
2162
2163	/* convert segments into a request list */
2164	cum_len = 0;
2165	seg = tx->seg_list;
2166	req->flags = MXGEFW_FLAGS_FIRST;
2167	for (i = 0; i < cnt; i++) {
2168		req->addr_low =
2169			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2170		req->addr_high =
2171			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2172		req->length = htobe16(seg->ds_len);
2173		req->cksum_offset = cksum_offset;
2174		if (cksum_offset > seg->ds_len)
2175			cksum_offset -= seg->ds_len;
2176		else
2177			cksum_offset = 0;
2178		req->pseudo_hdr_offset = pseudo_hdr_offset;
2179		req->pad = 0; /* complete solid 16-byte block */
2180		req->rdma_count = 1;
2181		req->flags |= flags | ((cum_len & 1) * odd_flag);
2182		cum_len += seg->ds_len;
2183		seg++;
2184		req++;
2185		req->flags = 0;
2186	}
2187	req--;
2188	/* pad runts to 60 bytes */
2189	if (cum_len < 60) {
2190		req++;
2191		req->addr_low =
2192			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2193		req->addr_high =
2194			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2195		req->length = htobe16(60 - cum_len);
2196		req->cksum_offset = 0;
2197		req->pseudo_hdr_offset = pseudo_hdr_offset;
2198		req->pad = 0; /* complete solid 16-byte block */
2199		req->rdma_count = 1;
2200		req->flags |= flags | ((cum_len & 1) * odd_flag);
2201		cnt++;
2202	}
2203
2204	tx->req_list[0].rdma_count = cnt;
2205#if 0
2206	/* print what the firmware will see */
2207	for (i = 0; i < cnt; i++) {
2208		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2209		    "cso:%d, flags:0x%x, rdma:%d\n",
2210		    i, (int)ntohl(tx->req_list[i].addr_high),
2211		    (int)ntohl(tx->req_list[i].addr_low),
2212		    (int)ntohs(tx->req_list[i].length),
2213		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2214		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2215		    tx->req_list[i].rdma_count);
2216	}
2217	printf("--------------\n");
2218#endif
2219	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2220	mxge_submit_req(tx, tx->req_list, cnt);
2221#ifdef IFNET_BUF_RING
2222	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2223		/* tell the NIC to start polling this slice */
2224		*tx->send_go = 1;
2225		tx->queue_active = 1;
2226		tx->activate++;
2227		wmb();
2228	}
2229#endif
2230	return;
2231
2232drop:
2233	m_freem(m);
2234drop_without_m:
2235	ss->oerrors++;
2236	return;
2237}
2238
2239#ifdef IFNET_BUF_RING
2240static void
2241mxge_qflush(struct ifnet *ifp)
2242{
2243	mxge_softc_t *sc = ifp->if_softc;
2244	mxge_tx_ring_t *tx;
2245	struct mbuf *m;
2246	int slice;
2247
2248	for (slice = 0; slice < sc->num_slices; slice++) {
2249		tx = &sc->ss[slice].tx;
2250		mtx_lock(&tx->mtx);
2251		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2252			m_freem(m);
2253		mtx_unlock(&tx->mtx);
2254	}
2255	if_qflush(ifp);
2256}
2257
2258static inline void
2259mxge_start_locked(struct mxge_slice_state *ss)
2260{
2261	mxge_softc_t *sc;
2262	struct mbuf *m;
2263	struct ifnet *ifp;
2264	mxge_tx_ring_t *tx;
2265
2266	sc = ss->sc;
2267	ifp = sc->ifp;
2268	tx = &ss->tx;
2269
2270	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2271		m = drbr_dequeue(ifp, tx->br);
2272		if (m == NULL) {
2273			return;
2274		}
2275		/* let BPF see it */
2276		BPF_MTAP(ifp, m);
2277
2278		/* give it to the nic */
2279		mxge_encap(ss, m);
2280	}
2281	/* ran out of transmit slots */
2282	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2283	    && (!drbr_empty(ifp, tx->br))) {
2284		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2285		tx->stall++;
2286	}
2287}
2288
2289static int
2290mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2291{
2292	mxge_softc_t *sc;
2293	struct ifnet *ifp;
2294	mxge_tx_ring_t *tx;
2295	int err;
2296
2297	sc = ss->sc;
2298	ifp = sc->ifp;
2299	tx = &ss->tx;
2300
2301	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2302	    IFF_DRV_RUNNING) {
2303		err = drbr_enqueue(ifp, tx->br, m);
2304		return (err);
2305	}
2306
2307	if (!drbr_needs_enqueue(ifp, tx->br) &&
2308	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2309		/* let BPF see it */
2310		BPF_MTAP(ifp, m);
2311		/* give it to the nic */
2312		mxge_encap(ss, m);
2313	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2314		return (err);
2315	}
2316	if (!drbr_empty(ifp, tx->br))
2317		mxge_start_locked(ss);
2318	return (0);
2319}
2320
2321static int
2322mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2323{
2324	mxge_softc_t *sc = ifp->if_softc;
2325	struct mxge_slice_state *ss;
2326	mxge_tx_ring_t *tx;
2327	int err = 0;
2328	int slice;
2329
2330	slice = m->m_pkthdr.flowid;
2331	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
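	/*
	 * e.g. (added note) with num_slices == 4 the mask is 3, so a
	 * flowid of 0x1d selects slice 1; each flow hashes to a stable
	 * slice without a modulo operation.
	 */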
2332
2333	ss = &sc->ss[slice];
2334	tx = &ss->tx;
2335
2336	if (mtx_trylock(&tx->mtx)) {
2337		err = mxge_transmit_locked(ss, m);
2338		mtx_unlock(&tx->mtx);
2339	} else {
2340		err = drbr_enqueue(ifp, tx->br, m);
2341	}
2342
2343	return (err);
2344}
2345
2346#else
2347
2348static inline void
2349mxge_start_locked(struct mxge_slice_state *ss)
2350{
2351	mxge_softc_t *sc;
2352	struct mbuf *m;
2353	struct ifnet *ifp;
2354	mxge_tx_ring_t *tx;
2355
2356	sc = ss->sc;
2357	ifp = sc->ifp;
2358	tx = &ss->tx;
2359	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2360		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2361		if (m == NULL) {
2362			return;
2363		}
2364		/* let BPF see it */
2365		BPF_MTAP(ifp, m);
2366
2367		/* give it to the nic */
2368		mxge_encap(ss, m);
2369	}
2370	/* ran out of transmit slots */
2371	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2372		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2373		tx->stall++;
2374	}
2375}
2376#endif
2377static void
2378mxge_start(struct ifnet *ifp)
2379{
2380	mxge_softc_t *sc = ifp->if_softc;
2381	struct mxge_slice_state *ss;
2382
2383	/* only use the first slice for now */
2384	ss = &sc->ss[0];
2385	mtx_lock(&ss->tx.mtx);
2386	mxge_start_locked(ss);
2387	mtx_unlock(&ss->tx.mtx);
2388}
2389
2390/*
2391 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2392 * at most 32 bytes at a time, so as to avoid involving the software
2393 * PIO handler in the NIC.  We re-write the first segment's low
2394 * DMA address to mark it valid only after we write the entire chunk
2395 * in a burst
2396 */
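/*
 * Added note: addr_low of the first descriptor is temporarily poisoned
 * to 0xffffffff so the firmware treats the whole chunk as invalid while
 * the two 32-byte bursts land; the final write of the real address,
 * ordered by wmb(), is what publishes all 8 descriptors at once.
 */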
2397static inline void
2398mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2399		mcp_kreq_ether_recv_t *src)
2400{
2401	uint32_t low;
2402
2403	low = src->addr_low;
2404	src->addr_low = 0xffffffff;
2405	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2406	wmb();
2407	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2408	wmb();
2409	src->addr_low = low;
2410	dst->addr_low = low;
2411	wmb();
2412}
2413
2414static int
2415mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2416{
2417	bus_dma_segment_t seg;
2418	struct mbuf *m;
2419	mxge_rx_ring_t *rx = &ss->rx_small;
2420	int cnt, err;
2421
2422	m = m_gethdr(M_NOWAIT, MT_DATA);
2423	if (m == NULL) {
2424		rx->alloc_fail++;
2425		err = ENOBUFS;
2426		goto done;
2427	}
2428	m->m_len = MHLEN;
2429	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2430				      &seg, &cnt, BUS_DMA_NOWAIT);
2431	if (err != 0) {
2432		m_free(m);
2433		goto done;
2434	}
2435	rx->info[idx].m = m;
2436	rx->shadow[idx].addr_low =
2437		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2438	rx->shadow[idx].addr_high =
2439		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2440
2441done:
2442	if ((idx & 7) == 7)
2443		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2444	return err;
2445}
2446
2447static int
2448mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2449{
2450	bus_dma_segment_t seg[3];
2451	struct mbuf *m;
2452	mxge_rx_ring_t *rx = &ss->rx_big;
2453	int cnt, err, i;
2454
2455	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2456	if (m == NULL) {
2457		rx->alloc_fail++;
2458		err = ENOBUFS;
2459		goto done;
2460	}
2461	m->m_len = rx->mlen;
2462	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2463				      seg, &cnt, BUS_DMA_NOWAIT);
2464	if (err != 0) {
2465		m_free(m);
2466		goto done;
2467	}
2468	rx->info[idx].m = m;
2469	rx->shadow[idx].addr_low =
2470		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2471	rx->shadow[idx].addr_high =
2472		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2473
2474#if MXGE_VIRT_JUMBOS
2475	for (i = 1; i < cnt; i++) {
2476		rx->shadow[idx + i].addr_low =
2477			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2478		rx->shadow[idx + i].addr_high =
2479			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2480	}
2481#endif
2482
2483done:
2484	for (i = 0; i < rx->nbufs; i++) {
2485		if ((idx & 7) == 7) {
2486			mxge_submit_8rx(&rx->lanai[idx - 7],
2487					&rx->shadow[idx - 7]);
2488		}
2489		idx++;
2490	}
2491	return err;
2492}
2493
2494#ifdef INET6
2495
2496static uint16_t
2497mxge_csum_generic(uint16_t *raw, int len)
2498{
2499	uint32_t csum;
2500
2501
2502	csum = 0;
2503	while (len > 0) {
2504		csum += *raw;
2505		raw++;
2506		len -= 2;
2507	}
2508	csum = (csum >> 16) + (csum & 0xffff);
2509	csum = (csum >> 16) + (csum & 0xffff);
2510	return (uint16_t)csum;
2511}
2512
2513static inline uint16_t
2514mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2515{
2516	uint32_t partial;
2517	int nxt, cksum_offset;
2518	struct ip6_hdr *ip6 = p;
2519	uint16_t c;
2520
2521	nxt = ip6->ip6_nxt;
2522	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2523	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2524		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2525					   IPPROTO_IPV6, &nxt);
2526		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2527			return (1);
2528	}
2529
2530	/*
2531	 * IPv6 headers do not contain a checksum, and hence
2532	 * do not checksum to zero, so they don't "fall out"
2533	 * of the partial checksum calculation like IPv4
2534	 * headers do.  We need to fix the partial checksum by
2535	 * subtracting the checksum of the IPv6 header.
2536	 */
2537
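	/*
	 * Added arithmetic note: the lines below subtract "partial" in
	 * ones-complement arithmetic.  Adding ~partial is the same as
	 * subtracting partial, (csum < ~partial) restores the end-around
	 * carry, and the two fold steps reduce the 32-bit accumulator
	 * back to 16 bits.
	 */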
2538	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2539				    ETHER_HDR_LEN);
2540	csum += ~partial;
2541	csum += (csum < ~partial);
2542	csum = (csum >> 16) + (csum & 0xFFFF);
2543	csum = (csum >> 16) + (csum & 0xFFFF);
2544	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2545			     csum);
2546	c ^= 0xffff;
2547	return (c);
2548}
2549#endif /* INET6 */
2550/*
2551 *  Myri10GE hardware checksums are not valid if the sender
2552 *  padded the frame with non-zero padding.  This is because
2553 *  the firmware just does a simple 16-bit 1s complement
2554 *  checksum across the entire frame, excluding the first 14
2555 *  bytes.  It is best to simply check the checksum and
2556 *  tell the stack about it only if the checksum is good
2557 */
2558
2559static inline uint16_t
2560mxge_rx_csum(struct mbuf *m, int csum)
2561{
2562	struct ether_header *eh;
2563#ifdef INET
2564	struct ip *ip;
2565#endif
2566#if defined(INET) || defined(INET6)
2567	int cap = m->m_pkthdr.rcvif->if_capenable;
2568#endif
2569	uint16_t c, etype;
2570
2571
2572	eh = mtod(m, struct ether_header *);
2573	etype = ntohs(eh->ether_type);
2574	switch (etype) {
2575#ifdef INET
2576	case ETHERTYPE_IP:
2577		if ((cap & IFCAP_RXCSUM) == 0)
2578			return (1);
2579		ip = (struct ip *)(eh + 1);
2580		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2581			return (1);
2582		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2583			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2584				    (ip->ip_hl << 2) + ip->ip_p));
2585		c ^= 0xffff;
2586		break;
2587#endif
2588#ifdef INET6
2589	case ETHERTYPE_IPV6:
2590		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2591			return (1);
2592		c = mxge_rx_csum6((eh + 1), m, csum);
2593		break;
2594#endif
2595	default:
2596		c = 1;
2597	}
2598	return (c);
2599}
2600
2601static void
2602mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2603{
2604	struct ether_vlan_header *evl;
2605	struct ether_header *eh;
2606	uint32_t partial;
2607
2608	evl = mtod(m, struct ether_vlan_header *);
2609	eh = mtod(m, struct ether_header *);
2610
2611	/*
2612	 * fix the checksum by subtracting the sum of the
2613	 * ETHER_VLAN_ENCAP_LEN bytes that sit just after what the
2614	 * firmware thought was the end of the ethernet header.
2615	 */
2616
2617	/* put checksum into host byte order */
2618	*csum = ntohs(*csum);
2619	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2620	(*csum) += ~partial;
2621	(*csum) +=  ((*csum) < ~partial);
2622	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2623	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624
2625	/* restore checksum to network byte order;
2626	   later consumers expect this */
2627	*csum = htons(*csum);
2628
2629	/* save the tag */
2630#ifdef MXGE_NEW_VLAN_API
2631	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2632#else
2633	{
2634		struct m_tag *mtag;
2635		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2636				   M_NOWAIT);
2637		if (mtag == NULL)
2638			return;
2639		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2640		m_tag_prepend(m, mtag);
2641	}
2642
2643#endif
2644	m->m_flags |= M_VLANTAG;
2645
2646	/*
2647	 * Remove the 802.1q header by copying the Ethernet
2648	 * addresses over it and adjusting the beginning of
2649	 * the data in the mbuf.  The encapsulated Ethernet
2650	 * type field is already in place.
2651	 */
2652	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2653	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2654	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2655}
2656
2657
2658static inline void
2659mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2660		 uint32_t csum, int lro)
2661{
2662	mxge_softc_t *sc;
2663	struct ifnet *ifp;
2664	struct mbuf *m;
2665	struct ether_header *eh;
2666	mxge_rx_ring_t *rx;
2667	bus_dmamap_t old_map;
2668	int idx;
2669
2670	sc = ss->sc;
2671	ifp = sc->ifp;
2672	rx = &ss->rx_big;
2673	idx = rx->cnt & rx->mask;
2674	rx->cnt += rx->nbufs;
2675	/* save a pointer to the received mbuf */
2676	m = rx->info[idx].m;
2677	/* try to replace the received mbuf */
2678	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2679		/* drop the frame -- the old mbuf is re-cycled */
2680		ifp->if_ierrors++;
2681		return;
2682	}
2683
2684	/* unmap the received buffer */
2685	old_map = rx->info[idx].map;
2686	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2687	bus_dmamap_unload(rx->dmat, old_map);
2688
2689	/* swap the bus_dmamap_t's */
2690	rx->info[idx].map = rx->extra_map;
2691	rx->extra_map = old_map;
2692
2693	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2694	 * aligned */
2695	m->m_data += MXGEFW_PAD;
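	/*
	 * Added note: with the 2 pad bytes skipped, the 14-byte Ethernet
	 * header ends on a 16-byte boundary, so the IP header that
	 * follows is 4-byte aligned as the upper layers expect.
	 */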
2696
2697	m->m_pkthdr.rcvif = ifp;
2698	m->m_len = m->m_pkthdr.len = len;
2699	ss->ipackets++;
2700	eh = mtod(m, struct ether_header *);
2701	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2702		mxge_vlan_tag_remove(m, &csum);
2703	}
2704	/* if the checksum is valid, mark it in the mbuf header */
2705
2706	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2707	    (0 == mxge_rx_csum(m, csum))) {
2708		/* Tell the stack that the checksum is good */
2709		m->m_pkthdr.csum_data = 0xffff;
2710		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2711			CSUM_DATA_VALID;
2712
2713#if defined(INET) || defined (INET6)
2714		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2715			return;
2716#endif
2717	}
2718	/* flowid only valid if RSS hashing is enabled */
2719	if (sc->num_slices > 1) {
2720		m->m_pkthdr.flowid = (ss - sc->ss);
2721		m->m_flags |= M_FLOWID;
2722	}
2723	/* pass the frame up the stack */
2724	(*ifp->if_input)(ifp, m);
2725}
2726
2727static inline void
2728mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2729		   uint32_t csum, int lro)
2730{
2731	mxge_softc_t *sc;
2732	struct ifnet *ifp;
2733	struct ether_header *eh;
2734	struct mbuf *m;
2735	mxge_rx_ring_t *rx;
2736	bus_dmamap_t old_map;
2737	int idx;
2738
2739	sc = ss->sc;
2740	ifp = sc->ifp;
2741	rx = &ss->rx_small;
2742	idx = rx->cnt & rx->mask;
2743	rx->cnt++;
2744	/* save a pointer to the received mbuf */
2745	m = rx->info[idx].m;
2746	/* try to replace the received mbuf */
2747	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2748		/* drop the frame -- the old mbuf is re-cycled */
2749		ifp->if_ierrors++;
2750		return;
2751	}
2752
2753	/* unmap the received buffer */
2754	old_map = rx->info[idx].map;
2755	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2756	bus_dmamap_unload(rx->dmat, old_map);
2757
2758	/* swap the bus_dmamap_t's */
2759	rx->info[idx].map = rx->extra_map;
2760	rx->extra_map = old_map;
2761
2762	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2763	 * aligned */
2764	m->m_data += MXGEFW_PAD;
2765
2766	m->m_pkthdr.rcvif = ifp;
2767	m->m_len = m->m_pkthdr.len = len;
2768	ss->ipackets++;
2769	eh = mtod(m, struct ether_header *);
2770	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2771		mxge_vlan_tag_remove(m, &csum);
2772	}
2773	/* if the checksum is valid, mark it in the mbuf header */
2774	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2775	    (0 == mxge_rx_csum(m, csum))) {
2776		/* Tell the stack that the checksum is good */
2777		m->m_pkthdr.csum_data = 0xffff;
2778		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2779			CSUM_DATA_VALID;
2780
2781#if defined(INET) || defined (INET6)
2782		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2783			return;
2784#endif
2785	}
2786	/* flowid only valid if RSS hashing is enabled */
2787	if (sc->num_slices > 1) {
2788		m->m_pkthdr.flowid = (ss - sc->ss);
2789		m->m_flags |= M_FLOWID;
2790	}
2791	/* pass the frame up the stack */
2792	(*ifp->if_input)(ifp, m);
2793}
2794
2795static inline void
2796mxge_clean_rx_done(struct mxge_slice_state *ss)
2797{
2798	mxge_rx_done_t *rx_done = &ss->rx_done;
2799	int limit = 0;
2800	uint16_t length;
2801	uint16_t checksum;
2802	int lro;
2803
2804	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2805	while (rx_done->entry[rx_done->idx].length != 0) {
2806		length = ntohs(rx_done->entry[rx_done->idx].length);
2807		rx_done->entry[rx_done->idx].length = 0;
2808		checksum = rx_done->entry[rx_done->idx].checksum;
2809		if (length <= (MHLEN - MXGEFW_PAD))
2810			mxge_rx_done_small(ss, length, checksum, lro);
2811		else
2812			mxge_rx_done_big(ss, length, checksum, lro);
2813		rx_done->cnt++;
2814		rx_done->idx = rx_done->cnt & rx_done->mask;
2815
2816		/* limit potential for livelock */
2817		if (__predict_false(++limit > rx_done->mask / 2))
2818			break;
2819	}
2820#if defined(INET)  || defined (INET6)
2821	while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2822		struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2823		SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2824		tcp_lro_flush(&ss->lc, lro);
2825	}
2826#endif
2827}
2828
2829
2830static inline void
2831mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2832{
2833	struct ifnet *ifp;
2834	mxge_tx_ring_t *tx;
2835	struct mbuf *m;
2836	bus_dmamap_t map;
2837	int idx;
2838	int *flags;
2839
2840	tx = &ss->tx;
2841	ifp = ss->sc->ifp;
2842	while (tx->pkt_done != mcp_idx) {
2843		idx = tx->done & tx->mask;
2844		tx->done++;
2845		m = tx->info[idx].m;
2846		/* mbuf and DMA map only attached to the first
2847		   segment per-mbuf */
2848		if (m != NULL) {
2849			ss->obytes += m->m_pkthdr.len;
2850			if (m->m_flags & M_MCAST)
2851				ss->omcasts++;
2852			ss->opackets++;
2853			tx->info[idx].m = NULL;
2854			map = tx->info[idx].map;
2855			bus_dmamap_unload(tx->dmat, map);
2856			m_freem(m);
2857		}
2858		if (tx->info[idx].flag) {
2859			tx->info[idx].flag = 0;
2860			tx->pkt_done++;
2861		}
2862	}
2863
2864	/* If we have space, clear IFF_OACTIVE to tell the stack that
2865	   it's OK to send packets */
2866#ifdef IFNET_BUF_RING
2867	flags = &ss->if_drv_flags;
2868#else
2869	flags = &ifp->if_drv_flags;
2870#endif
2871	mtx_lock(&ss->tx.mtx);
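	/*
	 * Added note: transmit is resumed only once the ring has drained
	 * below 25% occupancy (e.g. fewer than 256 outstanding
	 * descriptors on a 1024-entry ring), giving hysteresis so that
	 * OACTIVE is not bounced on every freed slot.
	 */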
2872	if ((*flags) & IFF_DRV_OACTIVE &&
2873	    tx->req - tx->done < (tx->mask + 1)/4) {
2874		*(flags) &= ~IFF_DRV_OACTIVE;
2875		ss->tx.wake++;
2876		mxge_start_locked(ss);
2877	}
2878#ifdef IFNET_BUF_RING
2879	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2880		/* let the NIC stop polling this queue, since there
2881		 * are no more transmits pending */
2883		*tx->send_stop = 1;
2884		tx->queue_active = 0;
2885		tx->deactivate++;
2886		wmb();
2888	}
2889#endif
2890	mtx_unlock(&ss->tx.mtx);
2891
2892}
2893
2894static struct mxge_media_type mxge_xfp_media_types[] =
2895{
2896	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2897	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2898	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2899	{0,		(1 << 5),	"10GBASE-ER"},
2900	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2901	{0,		(1 << 3),	"10GBASE-SW"},
2902	{0,		(1 << 2),	"10GBASE-LW"},
2903	{0,		(1 << 1),	"10GBASE-EW"},
2904	{0,		(1 << 0),	"Reserved"}
2905};
2906static struct mxge_media_type mxge_sfp_media_types[] =
2907{
2908	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2909	{0,		(1 << 7),	"Reserved"},
2910	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2911	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2912	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2913	{IFM_10G_TWINAX, (1 << 0),	"10GBASE-Twinax"}
2914};
2915
2916static void
2917mxge_media_set(mxge_softc_t *sc, int media_type)
2918{
2919
2920
2921	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2922		    0, NULL);
2923	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2924	sc->current_media = media_type;
2925	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2926}
2927
2928static void
2929mxge_media_init(mxge_softc_t *sc)
2930{
2931	char *ptr;
2932	int i;
2933
2934	ifmedia_removeall(&sc->media);
2935	mxge_media_set(sc, IFM_AUTO);
2936
2937	/*
2938	 * parse the product code to determine the interface type
2939	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2940	 * after the 3rd dash in the driver's cached copy of the
2941	 * EEPROM's product code string.
2942	 */
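	/*
	 * Added example (the product code is hypothetical): for a code
	 * such as "10G-PCIE-8B-S", the character inspected below is the
	 * 'S' after the third dash, which classifies the NIC as SFP+.
	 */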
2943	ptr = sc->product_code_string;
2944	if (ptr == NULL) {
2945		device_printf(sc->dev, "Missing product code\n");
2946		return;
2947	}
2948
2949	for (i = 0; i < 3; i++, ptr++) {
2950		ptr = strchr(ptr, '-');
2951		if (ptr == NULL) {
2952			device_printf(sc->dev,
2953				      "only %d dashes in PC?!?\n", i);
2954			return;
2955		}
2956	}
2957	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2958		/* -C is CX4 */
2959		sc->connector = MXGE_CX4;
2960		mxge_media_set(sc, IFM_10G_CX4);
2961	} else if (*ptr == 'Q') {
2962		/* -Q is Quad Ribbon Fiber */
2963		sc->connector = MXGE_QRF;
2964		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2965		/* FreeBSD has no media type for Quad ribbon fiber */
2966	} else if (*ptr == 'R') {
2967		/* -R is XFP */
2968		sc->connector = MXGE_XFP;
2969	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2970		/* -S or -2S is SFP+ */
2971		sc->connector = MXGE_SFP;
2972	} else {
2973		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2974	}
2975}
2976
2977/*
2978 * Determine the media type for a NIC.  Some XFPs will identify
2979 * themselves only when their link is up, so this is initiated via a
2980 * link up interrupt.  However, this can potentially take up to
2981 * several milliseconds, so it is run via the watchdog routine, rather
2982 * than in the interrupt handler itself.
2983 */
2984static void
2985mxge_media_probe(mxge_softc_t *sc)
2986{
2987	mxge_cmd_t cmd;
2988	char *cage_type;
2989
2990	struct mxge_media_type *mxge_media_types = NULL;
2991	int i, err, ms, mxge_media_type_entries;
2992	uint32_t byte;
2993
2994	sc->need_media_probe = 0;
2995
2996	if (sc->connector == MXGE_XFP) {
2997		/* -R is XFP */
2998		mxge_media_types = mxge_xfp_media_types;
2999		mxge_media_type_entries =
3000			sizeof (mxge_xfp_media_types) /
3001			sizeof (mxge_xfp_media_types[0]);
3002		byte = MXGE_XFP_COMPLIANCE_BYTE;
3003		cage_type = "XFP";
3004	} else if (sc->connector == MXGE_SFP) {
3005		/* -S or -2S is SFP+ */
3006		mxge_media_types = mxge_sfp_media_types;
3007		mxge_media_type_entries =
3008			sizeof (mxge_sfp_media_types) /
3009			sizeof (mxge_sfp_media_types[0]);
3010		cage_type = "SFP+";
3011		byte = 3;
3012	} else {
3013		/* nothing to do; media type cannot change */
3014		return;
3015	}
3016
3017	/*
3018	 * At this point we know the NIC has an XFP or SFP+ cage, so now
3019	 * we try to determine what is in the cage by using the
3020	 * firmware's I2C commands to read the module's 10GbE compliance
3021	 * register.  We read just one byte, which may take over
3022	 * a millisecond.
3023	 */
3024
3025	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3026	cmd.data1 = byte;
3027	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3028	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3029		device_printf(sc->dev, "failed to read XFP\n");
3030	}
3031	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3032		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3033	}
3034	if (err != MXGEFW_CMD_OK) {
3035		return;
3036	}
3037
3038	/* now we wait for the data to be cached */
3039	cmd.data0 = byte;
3040	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3041	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3042		DELAY(1000);
3043		cmd.data0 = byte;
3044		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3045	}
3046	if (err != MXGEFW_CMD_OK) {
3047		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3048			      cage_type, err, ms);
3049		return;
3050	}
3051
3052	if (cmd.data0 == mxge_media_types[0].bitmask) {
3053		if (mxge_verbose)
3054			device_printf(sc->dev, "%s:%s\n", cage_type,
3055				      mxge_media_types[0].name);
3056		if (sc->current_media != mxge_media_types[0].flag) {
3057			mxge_media_init(sc);
3058			mxge_media_set(sc, mxge_media_types[0].flag);
3059		}
3060		return;
3061	}
3062	for (i = 1; i < mxge_media_type_entries; i++) {
3063		if (cmd.data0 & mxge_media_types[i].bitmask) {
3064			if (mxge_verbose)
3065				device_printf(sc->dev, "%s:%s\n",
3066					      cage_type,
3067					      mxge_media_types[i].name);
3068
3069			if (sc->current_media != mxge_media_types[i].flag) {
3070				mxge_media_init(sc);
3071				mxge_media_set(sc, mxge_media_types[i].flag);
3072			}
3073			return;
3074		}
3075	}
3076	if (mxge_verbose)
3077		device_printf(sc->dev, "%s media 0x%x unknown\n",
3078			      cage_type, cmd.data0);
3079
3080	return;
3081}
3082
3083static void
3084mxge_intr(void *arg)
3085{
3086	struct mxge_slice_state *ss = arg;
3087	mxge_softc_t *sc = ss->sc;
3088	mcp_irq_data_t *stats = ss->fw_stats;
3089	mxge_tx_ring_t *tx = &ss->tx;
3090	mxge_rx_done_t *rx_done = &ss->rx_done;
3091	uint32_t send_done_count;
3092	uint8_t valid;
3093
3094
3095#ifndef IFNET_BUF_RING
3096	/* an interrupt on a non-zero slice is implicitly valid
3097	   since MSI-X irqs are not shared */
3098	if (ss != sc->ss) {
3099		mxge_clean_rx_done(ss);
3100		*ss->irq_claim = be32toh(3);
3101		return;
3102	}
3103#endif
3104
3105	/* make sure the DMA has finished */
3106	if (!stats->valid) {
3107		return;
3108	}
3109	valid = stats->valid;
3110
3111	if (sc->legacy_irq) {
3112		/* lower legacy IRQ  */
3113		*sc->irq_deassert = 0;
3114		if (!mxge_deassert_wait)
3115			/* don't wait for confirmation that the irq is low */
3116			stats->valid = 0;
3117	} else {
3118		stats->valid = 0;
3119	}
3120
3121	/* loop while waiting for legacy irq deassertion */
3122	do {
3123		/* check for transmit completes and receives */
3124		send_done_count = be32toh(stats->send_done_count);
3125		while ((send_done_count != tx->pkt_done) ||
3126		       (rx_done->entry[rx_done->idx].length != 0)) {
3127			if (send_done_count != tx->pkt_done)
3128				mxge_tx_done(ss, (int)send_done_count);
3129			mxge_clean_rx_done(ss);
3130			send_done_count = be32toh(stats->send_done_count);
3131		}
3132		if (sc->legacy_irq && mxge_deassert_wait)
3133			wmb();
3134	} while (*((volatile uint8_t *) &stats->valid));
3135
3136	/* fw link & error stats meaningful only on the first slice */
3137	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3138		if (sc->link_state != stats->link_up) {
3139			sc->link_state = stats->link_up;
3140			if (sc->link_state) {
3141				if_link_state_change(sc->ifp, LINK_STATE_UP);
3142				if_initbaudrate(sc->ifp, IF_Gbps(10));
3143				if (mxge_verbose)
3144					device_printf(sc->dev, "link up\n");
3145			} else {
3146				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3147				sc->ifp->if_baudrate = 0;
3148				if (mxge_verbose)
3149					device_printf(sc->dev, "link down\n");
3150			}
3151			sc->need_media_probe = 1;
3152		}
3153		if (sc->rdma_tags_available !=
3154		    be32toh(stats->rdma_tags_available)) {
3155			sc->rdma_tags_available =
3156				be32toh(stats->rdma_tags_available);
3157			device_printf(sc->dev, "RDMA timed out! %d tags "
3158				      "left\n", sc->rdma_tags_available);
3159		}
3160
3161		if (stats->link_down) {
3162			sc->down_cnt += stats->link_down;
3163			sc->link_state = 0;
3164			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3165		}
3166	}
3167
3168	/* check to see if we have rx token to pass back */
3169	if (valid & 0x1)
3170	    *ss->irq_claim = be32toh(3);
3171	*(ss->irq_claim + 1) = be32toh(3);
3172}
3173
3174static void
3175mxge_init(void *arg)
3176{
3177	mxge_softc_t *sc = arg;
3178	struct ifnet *ifp = sc->ifp;
3179
3180
3181	mtx_lock(&sc->driver_mtx);
3182	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3183		(void) mxge_open(sc);
3184	mtx_unlock(&sc->driver_mtx);
3185}
3186
3187
3188
3189static void
3190mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3191{
3192	int i;
3193
3194#if defined(INET) || defined(INET6)
3195	tcp_lro_free(&ss->lc);
3196#endif
3197	for (i = 0; i <= ss->rx_big.mask; i++) {
3198		if (ss->rx_big.info[i].m == NULL)
3199			continue;
3200		bus_dmamap_unload(ss->rx_big.dmat,
3201				  ss->rx_big.info[i].map);
3202		m_freem(ss->rx_big.info[i].m);
3203		ss->rx_big.info[i].m = NULL;
3204	}
3205
3206	for (i = 0; i <= ss->rx_small.mask; i++) {
3207		if (ss->rx_small.info[i].m == NULL)
3208			continue;
3209		bus_dmamap_unload(ss->rx_small.dmat,
3210				  ss->rx_small.info[i].map);
3211		m_freem(ss->rx_small.info[i].m);
3212		ss->rx_small.info[i].m = NULL;
3213	}
3214
3215	/* transmit ring used only on the first slice */
3216	if (ss->tx.info == NULL)
3217		return;
3218
3219	for (i = 0; i <= ss->tx.mask; i++) {
3220		ss->tx.info[i].flag = 0;
3221		if (ss->tx.info[i].m == NULL)
3222			continue;
3223		bus_dmamap_unload(ss->tx.dmat,
3224				  ss->tx.info[i].map);
3225		m_freem(ss->tx.info[i].m);
3226		ss->tx.info[i].m = NULL;
3227	}
3228}
3229
3230static void
3231mxge_free_mbufs(mxge_softc_t *sc)
3232{
3233	int slice;
3234
3235	for (slice = 0; slice < sc->num_slices; slice++)
3236		mxge_free_slice_mbufs(&sc->ss[slice]);
3237}
3238
3239static void
3240mxge_free_slice_rings(struct mxge_slice_state *ss)
3241{
3242	int i;
3243
3244
3245	if (ss->rx_done.entry != NULL)
3246		mxge_dma_free(&ss->rx_done.dma);
3247	ss->rx_done.entry = NULL;
3248
3249	if (ss->tx.req_bytes != NULL)
3250		free(ss->tx.req_bytes, M_DEVBUF);
3251	ss->tx.req_bytes = NULL;
3252
3253	if (ss->tx.seg_list != NULL)
3254		free(ss->tx.seg_list, M_DEVBUF);
3255	ss->tx.seg_list = NULL;
3256
3257	if (ss->rx_small.shadow != NULL)
3258		free(ss->rx_small.shadow, M_DEVBUF);
3259	ss->rx_small.shadow = NULL;
3260
3261	if (ss->rx_big.shadow != NULL)
3262		free(ss->rx_big.shadow, M_DEVBUF);
3263	ss->rx_big.shadow = NULL;
3264
3265	if (ss->tx.info != NULL) {
3266		if (ss->tx.dmat != NULL) {
3267			for (i = 0; i <= ss->tx.mask; i++) {
3268				bus_dmamap_destroy(ss->tx.dmat,
3269						   ss->tx.info[i].map);
3270			}
3271			bus_dma_tag_destroy(ss->tx.dmat);
3272		}
3273		free(ss->tx.info, M_DEVBUF);
3274	}
3275	ss->tx.info = NULL;
3276
3277	if (ss->rx_small.info != NULL) {
3278		if (ss->rx_small.dmat != NULL) {
3279			for (i = 0; i <= ss->rx_small.mask; i++) {
3280				bus_dmamap_destroy(ss->rx_small.dmat,
3281						   ss->rx_small.info[i].map);
3282			}
3283			bus_dmamap_destroy(ss->rx_small.dmat,
3284					   ss->rx_small.extra_map);
3285			bus_dma_tag_destroy(ss->rx_small.dmat);
3286		}
3287		free(ss->rx_small.info, M_DEVBUF);
3288	}
3289	ss->rx_small.info = NULL;
3290
3291	if (ss->rx_big.info != NULL) {
3292		if (ss->rx_big.dmat != NULL) {
3293			for (i = 0; i <= ss->rx_big.mask; i++) {
3294				bus_dmamap_destroy(ss->rx_big.dmat,
3295						   ss->rx_big.info[i].map);
3296			}
3297			bus_dmamap_destroy(ss->rx_big.dmat,
3298					   ss->rx_big.extra_map);
3299			bus_dma_tag_destroy(ss->rx_big.dmat);
3300		}
3301		free(ss->rx_big.info, M_DEVBUF);
3302	}
3303	ss->rx_big.info = NULL;
3304}
3305
3306static void
3307mxge_free_rings(mxge_softc_t *sc)
3308{
3309	int slice;
3310
3311	for (slice = 0; slice < sc->num_slices; slice++)
3312		mxge_free_slice_rings(&sc->ss[slice]);
3313}
3314
3315static int
3316mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3317		       int tx_ring_entries)
3318{
3319	mxge_softc_t *sc = ss->sc;
3320	size_t bytes;
3321	int err, i;
3322
3323	/* allocate per-slice receive resources */
3324
3325	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3326	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3327
3328	/* allocate the rx shadow rings */
3329	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3330	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3331
3332	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3333	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3334
3335	/* allocate the rx host info rings */
3336	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3337	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3338
3339	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3340	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3341
3342	/* allocate the rx busdma resources */
3343	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3344				 1,			/* alignment */
3345				 4096,			/* boundary */
3346				 BUS_SPACE_MAXADDR,	/* low */
3347				 BUS_SPACE_MAXADDR,	/* high */
3348				 NULL, NULL,		/* filter */
3349				 MHLEN,			/* maxsize */
3350				 1,			/* num segs */
3351				 MHLEN,			/* maxsegsize */
3352				 BUS_DMA_ALLOCNOW,	/* flags */
3353				 NULL, NULL,		/* lock */
3354				 &ss->rx_small.dmat);	/* tag */
3355	if (err != 0) {
3356		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3357			      err);
3358		return err;
3359	}
3360
3361	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3362				 1,			/* alignment */
3363#if MXGE_VIRT_JUMBOS
3364				 4096,			/* boundary */
3365#else
3366				 0,			/* boundary */
3367#endif
3368				 BUS_SPACE_MAXADDR,	/* low */
3369				 BUS_SPACE_MAXADDR,	/* high */
3370				 NULL, NULL,		/* filter */
3371				 3*4096,		/* maxsize */
3372#if MXGE_VIRT_JUMBOS
3373				 3,			/* num segs */
3374				 4096,			/* maxsegsize*/
3375#else
3376				 1,			/* num segs */
3377				 MJUM9BYTES,		/* maxsegsize*/
3378#endif
3379				 BUS_DMA_ALLOCNOW,	/* flags */
3380				 NULL, NULL,		/* lock */
3381				 &ss->rx_big.dmat);	/* tag */
3382	if (err != 0) {
3383		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3384			      err);
3385		return err;
3386	}
3387	for (i = 0; i <= ss->rx_small.mask; i++) {
3388		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3389					&ss->rx_small.info[i].map);
3390		if (err != 0) {
3391			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3392				      err);
3393			return err;
3394		}
3395	}
3396	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3397				&ss->rx_small.extra_map);
3398	if (err != 0) {
3399		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3400			      err);
3401		return err;
3402	}
3403
3404	for (i = 0; i <= ss->rx_big.mask; i++) {
3405		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3406					&ss->rx_big.info[i].map);
3407		if (err != 0) {
3408			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3409				      err);
3410			return err;
3411		}
3412	}
3413	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3414				&ss->rx_big.extra_map);
3415	if (err != 0) {
3416		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3417			      err);
3418		return err;
3419	}
3420
3421	/* now allocate TX resources */
3422
3423#ifndef IFNET_BUF_RING
3424	/* only use a single TX ring for now */
3425	if (ss != ss->sc->ss)
3426		return 0;
3427#endif
3428
3429	ss->tx.mask = tx_ring_entries - 1;
3430	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3431
3432
3433	/* allocate the tx request copy block */
3434	bytes = 8 +
3435		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3436	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3437	/* ensure req_list entries are aligned to 8 bytes */
3438	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3439		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
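	/*
	 * Added arithmetic note: (p + 7) & ~7 rounds a pointer up to the
	 * next 8-byte boundary, e.g. 0x1005 -> 0x1008, while an already
	 * aligned 0x1008 stays put; the 8 spare bytes in the allocation
	 * above keep the rounded pointer inside the buffer.
	 */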
3440
3441	/* allocate the tx busdma segment list */
3442	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3443	ss->tx.seg_list = (bus_dma_segment_t *)
3444		malloc(bytes, M_DEVBUF, M_WAITOK);
3445
3446	/* allocate the tx host info ring */
3447	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3448	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3449
3450	/* allocate the tx busdma resources */
3451	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3452				 1,			/* alignment */
3453				 sc->tx_boundary,	/* boundary */
3454				 BUS_SPACE_MAXADDR,	/* low */
3455				 BUS_SPACE_MAXADDR,	/* high */
3456				 NULL, NULL,		/* filter */
3457				 65536 + 256,		/* maxsize */
3458				 ss->tx.max_desc - 2,	/* num segs */
3459				 sc->tx_boundary,	/* maxsegsz */
3460				 BUS_DMA_ALLOCNOW,	/* flags */
3461				 NULL, NULL,		/* lock */
3462				 &ss->tx.dmat);		/* tag */
3463
3464	if (err != 0) {
3465		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3466			      err);
3467		return err;
3468	}
3469
3470	/* now use these tags to setup dmamaps for each slot
3471	   in the ring */
3472	for (i = 0; i <= ss->tx.mask; i++) {
3473		err = bus_dmamap_create(ss->tx.dmat, 0,
3474					&ss->tx.info[i].map);
3475		if (err != 0) {
3476			device_printf(sc->dev, "Err %d  tx dmamap\n",
3477				      err);
3478			return err;
3479		}
3480	}
3481	return 0;
3482
3483}
3484
3485static int
3486mxge_alloc_rings(mxge_softc_t *sc)
3487{
3488	mxge_cmd_t cmd;
3489	int tx_ring_size;
3490	int tx_ring_entries, rx_ring_entries;
3491	int err, slice;
3492
3493	/* get ring sizes */
3494	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3495	tx_ring_size = cmd.data0;
3496	if (err != 0) {
3497		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3498		goto abort;
3499	}
3500
3501	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3502	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3503	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3504	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3505	IFQ_SET_READY(&sc->ifp->if_snd);
3506
3507	for (slice = 0; slice < sc->num_slices; slice++) {
3508		err = mxge_alloc_slice_rings(&sc->ss[slice],
3509					     rx_ring_entries,
3510					     tx_ring_entries);
3511		if (err != 0)
3512			goto abort;
3513	}
3514	return 0;
3515
3516abort:
3517	mxge_free_rings(sc);
3518	return err;
3519
3520}
3521
3522
3523static void
3524mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3525{
3526	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
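	/*
	 * Added example (assuming the standard 2K MCLBYTES cluster): a
	 * 1500-byte MTU gives bufsize = 1500 + 14 + 4 + 2 = 1520, which
	 * fits a single cluster, while a 9000-byte MTU gives 9020 and
	 * falls through to the 9K/virtually-contiguous cases below.
	 */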
3527
3528	if (bufsize < MCLBYTES) {
3529		/* easy, everything fits in a single buffer */
3530		*big_buf_size = MCLBYTES;
3531		*cl_size = MCLBYTES;
3532		*nbufs = 1;
3533		return;
3534	}
3535
3536	if (bufsize < MJUMPAGESIZE) {
3537		/* still easy, everything still fits in a single buffer */
3538		*big_buf_size = MJUMPAGESIZE;
3539		*cl_size = MJUMPAGESIZE;
3540		*nbufs = 1;
3541		return;
3542	}
3543#if MXGE_VIRT_JUMBOS
3544	/* now we need to use virtually contiguous buffers */
3545	*cl_size = MJUM9BYTES;
3546	*big_buf_size = 4096;
3547	*nbufs = mtu / 4096 + 1;
3548	/* needs to be a power of two, so round up */
3549	if (*nbufs == 3)
3550		*nbufs = 4;
3551#else
3552	*cl_size = MJUM9BYTES;
3553	*big_buf_size = MJUM9BYTES;
3554	*nbufs = 1;
3555#endif
3556}
3557
3558static int
3559mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3560{
3561	mxge_softc_t *sc;
3562	mxge_cmd_t cmd;
3563	bus_dmamap_t map;
3564	int err, i, slice;
3565
3566
3567	sc = ss->sc;
3568	slice = ss - sc->ss;
3569
3570#if defined(INET) || defined(INET6)
3571	(void)tcp_lro_init(&ss->lc);
3572#endif
3573	ss->lc.ifp = sc->ifp;
3574
3575	/* get the lanai pointers to the send and receive rings */
3576
3577	err = 0;
3578#ifndef IFNET_BUF_RING
3579	/* We currently only send from the first slice */
3580	if (slice == 0) {
3581#endif
3582		cmd.data0 = slice;
3583		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3584		ss->tx.lanai =
3585			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3586		ss->tx.send_go = (volatile uint32_t *)
3587			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3588		ss->tx.send_stop = (volatile uint32_t *)
3589			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3590#ifndef IFNET_BUF_RING
3591	}
3592#endif
3593	cmd.data0 = slice;
3594	err |= mxge_send_cmd(sc,
3595			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3596	ss->rx_small.lanai =
3597		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3598	cmd.data0 = slice;
3599	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3600	ss->rx_big.lanai =
3601		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3602
3603	if (err != 0) {
3604		device_printf(sc->dev,
3605			      "failed to get ring sizes or locations\n");
3606		return EIO;
3607	}
3608
3609	/* stock receive rings */
3610	for (i = 0; i <= ss->rx_small.mask; i++) {
3611		map = ss->rx_small.info[i].map;
3612		err = mxge_get_buf_small(ss, map, i);
3613		if (err) {
3614			device_printf(sc->dev, "alloced %d/%d smalls\n",
3615				      i, ss->rx_small.mask + 1);
3616			return ENOMEM;
3617		}
3618	}
3619	for (i = 0; i <= ss->rx_big.mask; i++) {
3620		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3621		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3622	}
3623	ss->rx_big.nbufs = nbufs;
3624	ss->rx_big.cl_size = cl_size;
3625	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3626		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3627	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3628		map = ss->rx_big.info[i].map;
3629		err = mxge_get_buf_big(ss, map, i);
3630		if (err) {
3631			device_printf(sc->dev, "alloced %d/%d bigs\n",
3632				      i, ss->rx_big.mask + 1);
3633			return ENOMEM;
3634		}
3635	}
3636	return 0;
3637}
3638
3639static int
3640mxge_open(mxge_softc_t *sc)
3641{
3642	mxge_cmd_t cmd;
3643	int err, big_bytes, nbufs, slice, cl_size, i;
3644	bus_addr_t bus;
3645	volatile uint8_t *itable;
3646	struct mxge_slice_state *ss;
3647
3648	/* Copy the MAC address in case it was overridden */
3649	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3650
3651	err = mxge_reset(sc, 1);
3652	if (err != 0) {
3653		device_printf(sc->dev, "failed to reset\n");
3654		return EIO;
3655	}
3656
3657	if (sc->num_slices > 1) {
3658		/* setup the indirection table */
3659		cmd.data0 = sc->num_slices;
3660		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3661				    &cmd);
3662
3663		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3664				     &cmd);
3665		if (err != 0) {
3666			device_printf(sc->dev,
3667				      "failed to setup rss tables\n");
3668			return err;
3669		}
3670
3671		/* just enable an identity mapping */
3672		itable = sc->sram + cmd.data0;
3673		for (i = 0; i < sc->num_slices; i++)
3674			itable[i] = (uint8_t)i;
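		/*
		 * Added note: with, say, 4 slices the table reads
		 * {0, 1, 2, 3}, so the firmware's RSS hash value indexes
		 * directly to the matching receive slice.
		 */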
3675
3676		cmd.data0 = 1;
3677		cmd.data1 = mxge_rss_hash_type;
3678		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3679		if (err != 0) {
3680			device_printf(sc->dev, "failed to enable slices\n");
3681			return err;
3682		}
3683	}
3684
3685
3686	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3687
3688	cmd.data0 = nbufs;
3689	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3690			    &cmd);
3691	/* error is only meaningful if we're trying to set
3692	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3693	if (err && nbufs > 1) {
3694		device_printf(sc->dev,
3695			      "Failed to set always-use-n to %d\n",
3696			      nbufs);
3697		return EIO;
3698	}
3699	/* Give the firmware the mtu and the big and small buffer
3700	   sizes.  The firmware wants the big buf size to be a power
3701	   of two. Luckily, FreeBSD's clusters are powers of two */
3702	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3703	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3704	cmd.data0 = MHLEN - MXGEFW_PAD;
3705	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3706			     &cmd);
3707	cmd.data0 = big_bytes;
3708	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3709
3710	if (err != 0) {
3711		device_printf(sc->dev, "failed to setup params\n");
3712		goto abort;
3713	}
3714
3715	/* Now give the firmware the pointer to the stats block */
3716	for (slice = 0;
3717#ifdef IFNET_BUF_RING
3718	     slice < sc->num_slices;
3719#else
3720	     slice < 1;
3721#endif
3722	     slice++) {
3723		ss = &sc->ss[slice];
3724		cmd.data0 =
3725			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3726		cmd.data1 =
3727			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3728		cmd.data2 = sizeof(struct mcp_irq_data);
3729		cmd.data2 |= (slice << 16);
3730		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3731	}
3732
3733	if (err != 0) {
3734		bus = sc->ss->fw_stats_dma.bus_addr;
3735		bus += offsetof(struct mcp_irq_data, send_done_count);
3736		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3737		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3738		err = mxge_send_cmd(sc,
3739				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3740				    &cmd);
3741		/* Firmware cannot support multicast without STATS_DMA_V2 */
3742		sc->fw_multicast_support = 0;
3743	} else {
3744		sc->fw_multicast_support = 1;
3745	}
3746
3747	if (err != 0) {
3748		device_printf(sc->dev, "failed to setup params\n");
3749		goto abort;
3750	}
3751
3752	for (slice = 0; slice < sc->num_slices; slice++) {
3753		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3754		if (err != 0) {
3755			device_printf(sc->dev, "couldn't open slice %d\n",
3756				      slice);
3757			goto abort;
3758		}
3759	}
3760
3761	/* Finally, start the firmware running */
3762	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3763	if (err) {
3764		device_printf(sc->dev, "Couldn't bring up link\n");
3765		goto abort;
3766	}
3767#ifdef IFNET_BUF_RING
3768	for (slice = 0; slice < sc->num_slices; slice++) {
3769		ss = &sc->ss[slice];
3770		ss->if_drv_flags |= IFF_DRV_RUNNING;
3771		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3772	}
3773#endif
3774	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3775	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3776
3777	return 0;
3778
3779
3780abort:
3781	mxge_free_mbufs(sc);
3782
3783	return err;
3784}
3785
3786static int
3787mxge_close(mxge_softc_t *sc, int down)
3788{
3789	mxge_cmd_t cmd;
3790	int err, old_down_cnt;
3791#ifdef IFNET_BUF_RING
3792	struct mxge_slice_state *ss;
3793	int slice;
3794#endif
3795
3796#ifdef IFNET_BUF_RING
3797	for (slice = 0; slice < sc->num_slices; slice++) {
3798		ss = &sc->ss[slice];
3799		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3800	}
3801#endif
3802	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3803	if (!down) {
3804		old_down_cnt = sc->down_cnt;
3805		wmb();
3806		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3807		if (err) {
3808			device_printf(sc->dev,
3809				      "Couldn't bring down link\n");
3810		}
3811		if (old_down_cnt == sc->down_cnt) {
3812			/* wait for down irq */
3813			DELAY(10 * sc->intr_coal_delay);
3814		}
3815		wmb();
3816		if (old_down_cnt == sc->down_cnt) {
3817			device_printf(sc->dev, "never got down irq\n");
3818		}
3819	}
3820	mxge_free_mbufs(sc);
3821
3822	return 0;
3823}
3824
3825static void
3826mxge_setup_cfg_space(mxge_softc_t *sc)
3827{
3828	device_t dev = sc->dev;
3829	int reg;
3830	uint16_t lnk, pectl;
3831
3832	/* find the PCIe link width and set max read request to 4KB */
3833	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3834		lnk = pci_read_config(dev, reg + 0x12, 2);
3835		sc->link_width = (lnk >> 4) & 0x3f;
3836
3837		if (sc->pectl == 0) {
3838			pectl = pci_read_config(dev, reg + 0x8, 2);
3839			pectl = (pectl & ~0x7000) | (5 << 12);
3840			pci_write_config(dev, reg + 0x8, pectl, 2);
3841			sc->pectl = pectl;
3842		} else {
3843			/* restore saved pectl after watchdog reset */
3844			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3845		}
3846	}
3847
3848	/* Enable DMA and Memory space access */
3849	pci_enable_busmaster(dev);
3850}
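
/*
 * Register decode for the accesses above, relative to the PCIe
 * capability located by pci_find_cap(dev, PCIY_EXPRESS, &reg):
 *
 *	reg + 0x12 (Link Status): bits 9..4 hold the negotiated link
 *	width, so "(lnk >> 4) & 0x3f" yields e.g. 8 for a x8 link.
 *	reg + 0x8 (Device Control): bits 14..12 encode the max read
 *	request size as 128 << n; writing 5 selects 4096 bytes.
 */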
3851
3852static uint32_t
3853mxge_read_reboot(mxge_softc_t *sc)
3854{
3855	device_t dev = sc->dev;
3856	uint32_t vs;
3857
3858	/* find the vendor specific offset */
3859	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3860		device_printf(sc->dev,
3861			      "could not find vendor specific offset\n");
3862		return (uint32_t)-1;
3863	}
3864	/* enable read32 mode */
3865	pci_write_config(dev, vs + 0x10, 0x3, 1);
3866	/* tell NIC which register to read */
3867	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3868	return (pci_read_config(dev, vs + 0x14, 4));
3869}
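
/*
 * Usage sketch (illustrative only): callers can treat (uint32_t)-1 as
 * "unreadable", since it is both the error return above and the
 * all-ones pattern a hung PCI device yields:
 *
 *	uint32_t status = mxge_read_reboot(sc);
 *	if (status != (uint32_t)-1)
 *		device_printf(sc->dev, "reboot status 0x%x\n", status);
 */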
3870
3871static void
3872mxge_watchdog_reset(mxge_softc_t *sc)
3873{
3874	struct pci_devinfo *dinfo;
3875	struct mxge_slice_state *ss;
3876	int err, running, s, num_tx_slices = 1;
3877	uint32_t reboot;
3878	uint16_t cmd;
3879
3880	err = ENXIO;
3881
3882	device_printf(sc->dev, "Watchdog reset!\n");
3883
3884	/*
3885	 * check to see if the NIC rebooted.  If it did, then all of
3886	 * PCI config space has been reset, and things like the
3887	 * busmaster bit will be zero.  If this is the case, then we
3888	 * must restore PCI config space before the NIC can be used
3889	 * again
3890	 */
3891	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3892	if (cmd == 0xffff) {
3893		/*
3894		 * maybe the watchdog caught the NIC rebooting; wait
3895		 * up to 100ms for it to finish.  If it does not come
3896		 * back, then give up
3897		 */
3898		DELAY(1000*100);
3899		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3900		if (cmd == 0xffff) {
3901			device_printf(sc->dev, "NIC disappeared!\n");
3902		}
3903	}
3904	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3905		/* print the reboot status */
3906		reboot = mxge_read_reboot(sc);
3907		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3908			      reboot);
3909		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3910		if (running) {
3911
3912			/*
3913			 * quiesce NIC so that TX routines will not try to
3914			 * xmit after restoration of BAR
3915			 */
3916
3917			/* Mark the link as down */
3918			if (sc->link_state) {
3919				sc->link_state = 0;
3920				if_link_state_change(sc->ifp,
3921						     LINK_STATE_DOWN);
3922			}
3923#ifdef IFNET_BUF_RING
3924			num_tx_slices = sc->num_slices;
3925#endif
3926			/* grab all TX locks to ensure no tx */
3927			for (s = 0; s < num_tx_slices; s++) {
3928				ss = &sc->ss[s];
3929				mtx_lock(&ss->tx.mtx);
3930			}
3931			mxge_close(sc, 1);
3932		}
3933		/* restore PCI configuration space */
3934		dinfo = device_get_ivars(sc->dev);
3935		pci_cfg_restore(sc->dev, dinfo);
3936
3937		/* and redo any changes we made to our config space */
3938		mxge_setup_cfg_space(sc);
3939
3940		/* reload f/w */
3941		err = mxge_load_firmware(sc, 0);
3942		if (err) {
3943			device_printf(sc->dev,
3944				      "Unable to re-load f/w\n");
3945		}
3946		if (running) {
3947			if (!err)
3948				err = mxge_open(sc);
3949			/* release all TX locks */
3950			for (s = 0; s < num_tx_slices; s++) {
3951				ss = &sc->ss[s];
3952#ifdef IFNET_BUF_RING
3953				mxge_start_locked(ss);
3954#endif
3955				mtx_unlock(&ss->tx.mtx);
3956			}
3957		}
3958		sc->watchdog_resets++;
3959	} else {
3960		device_printf(sc->dev,
3961			      "NIC did not reboot, not resetting\n");
3962		err = 0;
3963	}
3964	if (err) {
3965		device_printf(sc->dev, "watchdog reset failed\n");
3966	} else {
3967		if (sc->dying == 2)
3968			sc->dying = 0;
3969		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3970	}
3971}
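
/*
 * Recovery order used above: detect the reboot via the cleared
 * busmaster bit, mark the link down, quiesce transmit by taking every
 * tx mutex, close the interface, restore the PCI config space the
 * reboot wiped, reapply the driver's own config tweaks, reload the
 * firmware, reopen, and only then release the tx locks so senders
 * resume.
 */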
3972
3973static void
3974mxge_watchdog_task(void *arg, int pending)
3975{
3976	mxge_softc_t *sc = arg;
3977
3978
3979	mtx_lock(&sc->driver_mtx);
3980	mxge_watchdog_reset(sc);
3981	mtx_unlock(&sc->driver_mtx);
3982}
3983
3984static void
3985mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3986{
3987	tx = &sc->ss[slice].tx;
3988	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3989	device_printf(sc->dev,
3990		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3991		      tx->req, tx->done, tx->queue_active);
3992	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3993		      tx->activate, tx->deactivate);
3994	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3995		      tx->pkt_done,
3996		      be32toh(sc->ss->fw_stats->send_done_count));
3997}
3998
3999static int
4000mxge_watchdog(mxge_softc_t *sc)
4001{
4002	mxge_tx_ring_t *tx;
4003	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
4004	int i, err = 0;
4005
4006	/* see if we have outstanding transmits, which
4007	   have been pending for more than mxge_ticks */
4008	for (i = 0;
4009#ifdef IFNET_BUF_RING
4010	     (i < sc->num_slices) && (err == 0);
4011#else
4012	     (i < 1) && (err == 0);
4013#endif
4014	     i++) {
4015		tx = &sc->ss[i].tx;
4016		if (tx->req != tx->done &&
4017		    tx->watchdog_req != tx->watchdog_done &&
4018		    tx->done == tx->watchdog_done) {
4019			/* check for pause blocking before resetting */
4020			if (tx->watchdog_rx_pause == rx_pause) {
4021				mxge_warn_stuck(sc, tx, i);
4022				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4023				return (ENXIO);
4024			}
4025			else
4026				device_printf(sc->dev, "Flow control blocking "
4027					      "xmits, check link partner\n");
4028		}
4029
4030		tx->watchdog_req = tx->req;
4031		tx->watchdog_done = tx->done;
4032		tx->watchdog_rx_pause = rx_pause;
4033	}
4034
4035	if (sc->need_media_probe)
4036		mxge_media_probe(sc);
4037	return (err);
4038}
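
/*
 * The stuck-transmit test above fires only when, for a slice, requests
 * are outstanding now (req != done), work was already pending at the
 * previous tick (watchdog_req != watchdog_done), and no completions
 * have arrived since that tick (done == watchdog_done).  If the NIC's
 * dropped_pause counter advanced over the same interval, the stall is
 * blamed on flow control and only a warning is printed; otherwise the
 * watchdog task is queued to reset the NIC.
 */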
4039
4040static u_long
4041mxge_update_stats(mxge_softc_t *sc)
4042{
4043	struct mxge_slice_state *ss;
4044	u_long pkts = 0;
4045	u_long ipackets = 0;
4046	u_long opackets = 0;
4047#ifdef IFNET_BUF_RING
4048	u_long obytes = 0;
4049	u_long omcasts = 0;
4050	u_long odrops = 0;
4051#endif
4052	u_long oerrors = 0;
4053	int slice;
4054
4055	for (slice = 0; slice < sc->num_slices; slice++) {
4056		ss = &sc->ss[slice];
4057		ipackets += ss->ipackets;
4058		opackets += ss->opackets;
4059#ifdef IFNET_BUF_RING
4060		obytes += ss->obytes;
4061		omcasts += ss->omcasts;
4062		odrops += ss->tx.br->br_drops;
4063#endif
4064		oerrors += ss->oerrors;
4065	}
4066	pkts = (ipackets - sc->ifp->if_ipackets);
4067	pkts += (opackets - sc->ifp->if_opackets);
4068	sc->ifp->if_ipackets = ipackets;
4069	sc->ifp->if_opackets = opackets;
4070#ifdef IFNET_BUF_RING
4071	sc->ifp->if_obytes = obytes;
4072	sc->ifp->if_omcasts = omcasts;
4073	sc->ifp->if_snd.ifq_drops = odrops;
4074#endif
4075	sc->ifp->if_oerrors = oerrors;
4076	return pkts;
4077}
4078
4079static void
4080mxge_tick(void *arg)
4081{
4082	mxge_softc_t *sc = arg;
4083	u_long pkts = 0;
4084	int err = 0;
4085	int running, ticks;
4086	uint16_t cmd;
4087
4088	ticks = mxge_ticks;
4089	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4090	if (running) {
4091		/* aggregate stats from different slices */
4092		pkts = mxge_update_stats(sc);
4093		if (!sc->watchdog_countdown) {
4094			err = mxge_watchdog(sc);
4095			sc->watchdog_countdown = 4;
4096		}
4097		sc->watchdog_countdown--;
4098	}
4099	if (pkts == 0) {
4100		/* ensure NIC did not suffer h/w fault while idle */
4101		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4102		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4103			sc->dying = 2;
4104			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4105			err = ENXIO;
4106		}
4107		/* look less often if NIC is idle */
4108		ticks *= 4;
4109	}
4110
4111	if (err == 0)
4112		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4113
4114}
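
/*
 * Polling cadence: while traffic flows, mxge_tick() reschedules itself
 * every mxge_ticks (hz/2 by default); when no packets moved, it
 * stretches the interval fourfold but still reads PCIR_COMMAND, so a
 * hardware fault on an idle NIC is still caught by the watchdog task.
 */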
4115
4116static int
4117mxge_media_change(struct ifnet *ifp)
4118{
4119	return EINVAL;
4120}
4121
4122static int
4123mxge_change_mtu(mxge_softc_t *sc, int mtu)
4124{
4125	struct ifnet *ifp = sc->ifp;
4126	int real_mtu, old_mtu;
4127	int err = 0;
4128
4129
4130	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4131	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4132		return EINVAL;
4133	mtx_lock(&sc->driver_mtx);
4134	old_mtu = ifp->if_mtu;
4135	ifp->if_mtu = mtu;
4136	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4137		mxge_close(sc, 0);
4138		err = mxge_open(sc);
4139		if (err != 0) {
4140			ifp->if_mtu = old_mtu;
4141			mxge_close(sc, 0);
4142			(void) mxge_open(sc);
4143		}
4144	}
4145	mtx_unlock(&sc->driver_mtx);
4146	return err;
4147}
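
/*
 * Worked example for the check above: a requested MTU of 9000 gives
 * real_mtu = 9000 + ETHER_HDR_LEN (14) + ETHER_VLAN_ENCAP_LEN (4) =
 * 9018, which must not exceed the firmware-reported sc->max_mtu.
 */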
4148
4149static void
4150mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4151{
4152	mxge_softc_t *sc = ifp->if_softc;
4153
4154
4155	if (sc == NULL)
4156		return;
4157	ifmr->ifm_status = IFM_AVALID;
4158	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4159	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4160	ifmr->ifm_active |= sc->current_media;
4161}
4162
4163static int
4164mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4165{
4166	mxge_softc_t *sc = ifp->if_softc;
4167	struct ifreq *ifr = (struct ifreq *)data;
4168	int err, mask;
4169
4170	err = 0;
4171	switch (command) {
4172	case SIOCSIFADDR:
4173	case SIOCGIFADDR:
4174		err = ether_ioctl(ifp, command, data);
4175		break;
4176
4177	case SIOCSIFMTU:
4178		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4179		break;
4180
4181	case SIOCSIFFLAGS:
4182		mtx_lock(&sc->driver_mtx);
4183		if (sc->dying) {
4184			mtx_unlock(&sc->driver_mtx);
4185			return EINVAL;
4186		}
4187		if (ifp->if_flags & IFF_UP) {
4188			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4189				err = mxge_open(sc);
4190			} else {
4191				/* take care of promisc and allmulti
4192				   flag changes */
4193				mxge_change_promisc(sc,
4194						    ifp->if_flags & IFF_PROMISC);
4195				mxge_set_multicast_list(sc);
4196			}
4197		} else {
4198			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4199				mxge_close(sc, 0);
4200			}
4201		}
4202		mtx_unlock(&sc->driver_mtx);
4203		break;
4204
4205	case SIOCADDMULTI:
4206	case SIOCDELMULTI:
4207		mtx_lock(&sc->driver_mtx);
4208		mxge_set_multicast_list(sc);
4209		mtx_unlock(&sc->driver_mtx);
4210		break;
4211
4212	case SIOCSIFCAP:
4213		mtx_lock(&sc->driver_mtx);
4214		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4215		if (mask & IFCAP_TXCSUM) {
4216			if (IFCAP_TXCSUM & ifp->if_capenable) {
4217				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4218				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4219			} else {
4220				ifp->if_capenable |= IFCAP_TXCSUM;
4221				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4222			}
4223		} else if (mask & IFCAP_RXCSUM) {
4224			if (IFCAP_RXCSUM & ifp->if_capenable) {
4225				ifp->if_capenable &= ~IFCAP_RXCSUM;
4226			} else {
4227				ifp->if_capenable |= IFCAP_RXCSUM;
4228			}
4229		}
4230		if (mask & IFCAP_TSO4) {
4231			if (IFCAP_TSO4 & ifp->if_capenable) {
4232				ifp->if_capenable &= ~IFCAP_TSO4;
4233			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4234				ifp->if_capenable |= IFCAP_TSO4;
4235				ifp->if_hwassist |= CSUM_TSO;
4236			} else {
4237				printf("mxge requires tx checksum offload"
4238				       " be enabled to use TSO\n");
4239				err = EINVAL;
4240			}
4241		}
4242#if IFCAP_TSO6
4243		if (mask & IFCAP_TXCSUM_IPV6) {
4244			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4245				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4246						       | IFCAP_TSO6);
4247				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4248						      | CSUM_UDP);
4249			} else {
4250				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4251				ifp->if_hwassist |= (CSUM_TCP_IPV6
4252						     | CSUM_UDP_IPV6);
4253			}
4254		} else if (mask & IFCAP_RXCSUM_IPV6) {
4255			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4256				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4257			} else {
4258				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4259			}
4260		}
4261		if (mask & IFCAP_TSO6) {
4262			if (IFCAP_TSO6 & ifp->if_capenable) {
4263				ifp->if_capenable &= ~IFCAP_TSO6;
4264			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4265				ifp->if_capenable |= IFCAP_TSO6;
4266				ifp->if_hwassist |= CSUM_TSO;
4267			} else {
4268				printf("mxge requires tx checksum offload"
4269				       " be enabled to use TSO\n");
4270				err = EINVAL;
4271			}
4272		}
4273#endif /*IFCAP_TSO6 */
4274
4275		if (mask & IFCAP_LRO)
4276			ifp->if_capenable ^= IFCAP_LRO;
4277		if (mask & IFCAP_VLAN_HWTAGGING)
4278			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4279		if (mask & IFCAP_VLAN_HWTSO)
4280			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4281
4282		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4283		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4284			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4285
4286		mtx_unlock(&sc->driver_mtx);
4287		VLAN_CAPABILITIES(ifp);
4288
4289		break;
4290
4291	case SIOCGIFMEDIA:
4292		mtx_lock(&sc->driver_mtx);
4293		mxge_media_probe(sc);
4294		mtx_unlock(&sc->driver_mtx);
4295		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4296				    &sc->media, command);
4297		break;
4298
4299	default:
4300		err = ENOTTY;
4301	}
4302	return err;
4303}
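
/*
 * Capability dependencies enforced by the SIOCSIFCAP case above, as
 * seen from userland (the interface name here is hypothetical):
 *
 *	ifconfig mxge0 -txcsum		# also clears tso4
 *	ifconfig mxge0 tso4		# EINVAL unless txcsum is enabled
 *	ifconfig mxge0 -vlanhwtag	# also clears vlanhwtso
 */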
4304
4305static void
4306mxge_fetch_tunables(mxge_softc_t *sc)
4307{
4308
4309	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4310	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4311			  &mxge_flow_control);
4312	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4313			  &mxge_intr_coal_delay);
4314	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4315			  &mxge_nvidia_ecrc_enable);
4316	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4317			  &mxge_force_firmware);
4318	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4319			  &mxge_deassert_wait);
4320	TUNABLE_INT_FETCH("hw.mxge.verbose",
4321			  &mxge_verbose);
4322	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4323	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4324	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4325	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4326	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4327	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4328
4329	if (bootverbose)
4330		mxge_verbose = 1;
4331	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4332		mxge_intr_coal_delay = 30;
4333	if (mxge_ticks == 0)
4334		mxge_ticks = hz / 2;
4335	sc->pause = mxge_flow_control;
4336	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4337	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4338		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4339	}
4340	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4341	    mxge_initial_mtu < ETHER_MIN_LEN)
4342		mxge_initial_mtu = ETHERMTU_JUMBO;
4343
4344	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4345		mxge_throttle = MXGE_MAX_THROTTLE;
4346	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4347		mxge_throttle = MXGE_MIN_THROTTLE;
4348	sc->throttle = mxge_throttle;
4349}
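
/*
 * These tunables are fetched from the kernel environment, so they are
 * normally set in /boot/loader.conf before the driver loads, e.g.:
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 */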
4350
4351
4352static void
4353mxge_free_slices(mxge_softc_t *sc)
4354{
4355	struct mxge_slice_state *ss;
4356	int i;
4357
4358
4359	if (sc->ss == NULL)
4360		return;
4361
4362	for (i = 0; i < sc->num_slices; i++) {
4363		ss = &sc->ss[i];
4364		if (ss->fw_stats != NULL) {
4365			mxge_dma_free(&ss->fw_stats_dma);
4366			ss->fw_stats = NULL;
4367#ifdef IFNET_BUF_RING
4368			if (ss->tx.br != NULL) {
4369				drbr_free(ss->tx.br, M_DEVBUF);
4370				ss->tx.br = NULL;
4371			}
4372#endif
4373			mtx_destroy(&ss->tx.mtx);
4374		}
4375		if (ss->rx_done.entry != NULL) {
4376			mxge_dma_free(&ss->rx_done.dma);
4377			ss->rx_done.entry = NULL;
4378		}
4379	}
4380	free(sc->ss, M_DEVBUF);
4381	sc->ss = NULL;
4382}
4383
4384static int
4385mxge_alloc_slices(mxge_softc_t *sc)
4386{
4387	mxge_cmd_t cmd;
4388	struct mxge_slice_state *ss;
4389	size_t bytes;
4390	int err, i, max_intr_slots;
4391
4392	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4393	if (err != 0) {
4394		device_printf(sc->dev, "Cannot determine rx ring size\n");
4395		return err;
4396	}
4397	sc->rx_ring_size = cmd.data0;
4398	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4399
4400	bytes = sizeof (*sc->ss) * sc->num_slices;
4401	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4402	if (sc->ss == NULL)
4403		return (ENOMEM);
4404	for (i = 0; i < sc->num_slices; i++) {
4405		ss = &sc->ss[i];
4406
4407		ss->sc = sc;
4408
4409		/* allocate per-slice rx interrupt queues */
4410
4411		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4412		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4413		if (err != 0)
4414			goto abort;
4415		ss->rx_done.entry = ss->rx_done.dma.addr;
4416		bzero(ss->rx_done.entry, bytes);
4417
4418		/*
4419		 * allocate the per-slice firmware stats; stats
4420		 * (including tx) are used only on the first
4421		 * slice for now
4422		 */
4423#ifndef IFNET_BUF_RING
4424		if (i > 0)
4425			continue;
4426#endif
4427
4428		bytes = sizeof (*ss->fw_stats);
4429		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4430				     sizeof (*ss->fw_stats), 64);
4431		if (err != 0)
4432			goto abort;
4433		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4434		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4435			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4436		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4437#ifdef IFNET_BUF_RING
4438		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4439					   &ss->tx.mtx);
4440#endif
4441	}
4442
4443	return (0);
4444
4445abort:
4446	mxge_free_slices(sc);
4447	return (ENOMEM);
4448}
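
/*
 * Sizing note: max_intr_slots above is twice the rx ring entry count
 * (rx_ring_size / sizeof(mcp_dma_addr_t)), presumably so each slice's
 * rx_done queue has room for a completion from every slot of both the
 * small and big receive rings.
 */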
4449
4450static void
4451mxge_slice_probe(mxge_softc_t *sc)
4452{
4453	mxge_cmd_t cmd;
4454	char *old_fw;
4455	int msix_cnt, status, max_intr_slots;
4456
4457	sc->num_slices = 1;
4458	/*
4459	 *  don't enable multiple slices if the tunable disables them,
4460	 *  or if this is not an SMP system
4461	 */
4462
4463	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4464		return;
4465
4466	/* see how many MSI-X interrupts are available */
4467	msix_cnt = pci_msix_count(sc->dev);
4468	if (msix_cnt < 2)
4469		return;
4470
4471	/* now load the slice aware firmware to see what it supports */
4472	old_fw = sc->fw_name;
4473	if (old_fw == mxge_fw_aligned)
4474		sc->fw_name = mxge_fw_rss_aligned;
4475	else
4476		sc->fw_name = mxge_fw_rss_unaligned;
4477	status = mxge_load_firmware(sc, 0);
4478	if (status != 0) {
4479		device_printf(sc->dev, "Falling back to a single slice\n");
4480		return;
4481	}
4482
4483	/* try to send a reset command to the card to see if it
4484	   is alive */
4485	memset(&cmd, 0, sizeof (cmd));
4486	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4487	if (status != 0) {
4488		device_printf(sc->dev, "failed reset\n");
4489		goto abort_with_fw;
4490	}
4491
4492	/* get rx ring size */
4493	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4494	if (status != 0) {
4495		device_printf(sc->dev, "Cannot determine rx ring size\n");
4496		goto abort_with_fw;
4497	}
4498	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4499
4500	/* tell it the size of the interrupt queues */
4501	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4502	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4503	if (status != 0) {
4504		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4505		goto abort_with_fw;
4506	}
4507
4508	/* ask the maximum number of slices it supports */
4509	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4510	if (status != 0) {
4511		device_printf(sc->dev,
4512			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4513		goto abort_with_fw;
4514	}
4515	sc->num_slices = cmd.data0;
4516	if (sc->num_slices > msix_cnt)
4517		sc->num_slices = msix_cnt;
4518
4519	if (mxge_max_slices == -1) {
4520		/* cap to number of CPUs in system */
4521		if (sc->num_slices > mp_ncpus)
4522			sc->num_slices = mp_ncpus;
4523	} else {
4524		if (sc->num_slices > mxge_max_slices)
4525			sc->num_slices = mxge_max_slices;
4526	}
4527	/* make sure it is a power of two */
4528	while (sc->num_slices & (sc->num_slices - 1))
4529		sc->num_slices--;
4530
4531	if (mxge_verbose)
4532		device_printf(sc->dev, "using %d slices\n",
4533			      sc->num_slices);
4534
4535	return;
4536
4537abort_with_fw:
4538	sc->fw_name = old_fw;
4539	(void) mxge_load_firmware(sc, 0);
4540}
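
/*
 * The loop above clamps num_slices to a power of two by decrementing
 * until (n & (n - 1)) == 0: e.g. 6 -> 5 -> 4.  A power-of-two count
 * lets the firmware's RSS hash select a slice with a simple mask,
 * which is presumably why the firmware requires it.
 */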
4541
4542static int
4543mxge_add_msix_irqs(mxge_softc_t *sc)
4544{
4545	size_t bytes;
4546	int count, err, i, rid;
4547
4548	rid = PCIR_BAR(2);
4549	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4550						    &rid, RF_ACTIVE);
4551
4552	if (sc->msix_table_res == NULL) {
4553		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4554		return ENXIO;
4555	}
4556
4557	count = sc->num_slices;
4558	err = pci_alloc_msix(sc->dev, &count);
4559	if (err != 0) {
4560		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4561			      "err = %d \n", sc->num_slices, err);
4562		goto abort_with_msix_table;
4563	}
4564	if (count < sc->num_slices) {
4565		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4566			      count, sc->num_slices);
4567		device_printf(sc->dev,
4568			      "Try setting hw.mxge.max_slices to %d\n",
4569			      count);
4570		err = ENOSPC;
4571		goto abort_with_msix;
4572	}
4573	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4574	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4575	if (sc->msix_irq_res == NULL) {
4576		err = ENOMEM;
4577		goto abort_with_msix;
4578	}
4579
4580	for (i = 0; i < sc->num_slices; i++) {
4581		rid = i + 1;
4582		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4583							  SYS_RES_IRQ,
4584							  &rid, RF_ACTIVE);
4585		if (sc->msix_irq_res[i] == NULL) {
4586			device_printf(sc->dev, "couldn't allocate IRQ res"
4587				      " for message %d\n", i);
4588			err = ENXIO;
4589			goto abort_with_res;
4590		}
4591	}
4592
4593	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4594	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4595
4596	for (i = 0; i < sc->num_slices; i++) {
4597		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4598				     INTR_TYPE_NET | INTR_MPSAFE,
4599#if __FreeBSD_version > 700030
4600				     NULL,
4601#endif
4602				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4603		if (err != 0) {
4604			device_printf(sc->dev, "couldn't setup intr for "
4605				      "message %d\n", i);
4606			goto abort_with_intr;
4607		}
4608		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4609				  sc->msix_ih[i], "s%d", i);
4610	}
4611
4612	if (mxge_verbose) {
4613		device_printf(sc->dev, "using %d msix IRQs:",
4614			      sc->num_slices);
4615		for (i = 0; i < sc->num_slices; i++)
4616			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4617		printf("\n");
4618	}
4619	return (0);
4620
4621abort_with_intr:
4622	for (i = 0; i < sc->num_slices; i++) {
4623		if (sc->msix_ih[i] != NULL) {
4624			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4625					  sc->msix_ih[i]);
4626			sc->msix_ih[i] = NULL;
4627		}
4628	}
4629	free(sc->msix_ih, M_DEVBUF);
4630
4631
4632abort_with_res:
4633	for (i = 0; i < sc->num_slices; i++) {
4634		rid = i + 1;
4635		if (sc->msix_irq_res[i] != NULL)
4636			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4637					     sc->msix_irq_res[i]);
4638		sc->msix_irq_res[i] = NULL;
4639	}
4640	free(sc->msix_irq_res, M_DEVBUF);
4641
4642
4643abort_with_msix:
4644	pci_release_msi(sc->dev);
4645
4646abort_with_msix_table:
4647	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4648			     sc->msix_table_res);
4649
4650	return err;
4651}
4652
4653static int
4654mxge_add_single_irq(mxge_softc_t *sc)
4655{
4656	int count, err, rid;
4657
4658	count = pci_msi_count(sc->dev);
4659	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4660		rid = 1;
4661	} else {
4662		rid = 0;
4663		sc->legacy_irq = 1;
4664	}
4665	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4666					 1, RF_SHAREABLE | RF_ACTIVE);
4667	if (sc->irq_res == NULL) {
4668		device_printf(sc->dev, "could not alloc interrupt\n");
4669		return ENXIO;
4670	}
4671	if (mxge_verbose)
4672		device_printf(sc->dev, "using %s irq %ld\n",
4673			      sc->legacy_irq ? "INTx" : "MSI",
4674			      rman_get_start(sc->irq_res));
4675	err = bus_setup_intr(sc->dev, sc->irq_res,
4676			     INTR_TYPE_NET | INTR_MPSAFE,
4677#if __FreeBSD_version > 700030
4678			     NULL,
4679#endif
4680			     mxge_intr, &sc->ss[0], &sc->ih);
4681	if (err != 0) {
4682		bus_release_resource(sc->dev, SYS_RES_IRQ,
4683				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4684		if (!sc->legacy_irq)
4685			pci_release_msi(sc->dev);
4686	}
4687	return err;
4688}
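
/*
 * The rid selection above follows FreeBSD convention: SYS_RES_IRQ rid 0
 * is the legacy INTx line from PCI config space, while rid 1 is the
 * first (and here only) MSI message allocated by pci_alloc_msi().
 */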
4689
4690static void
4691mxge_rem_msix_irqs(mxge_softc_t *sc)
4692{
4693	int i, rid;
4694
4695	for (i = 0; i < sc->num_slices; i++) {
4696		if (sc->msix_ih[i] != NULL) {
4697			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4698					  sc->msix_ih[i]);
4699			sc->msix_ih[i] = NULL;
4700		}
4701	}
4702	free(sc->msix_ih, M_DEVBUF);
4703
4704	for (i = 0; i < sc->num_slices; i++) {
4705		rid = i + 1;
4706		if (sc->msix_irq_res[i] != NULL)
4707			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4708					     sc->msix_irq_res[i]);
4709		sc->msix_irq_res[i] = NULL;
4710	}
4711	free(sc->msix_irq_res, M_DEVBUF);
4712
4713	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4714			     sc->msix_table_res);
4715
4716	pci_release_msi(sc->dev);
4717	return;
4718}
4719
4720static void
4721mxge_rem_single_irq(mxge_softc_t *sc)
4722{
4723	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4724	bus_release_resource(sc->dev, SYS_RES_IRQ,
4725			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4726	if (!sc->legacy_irq)
4727		pci_release_msi(sc->dev);
4728}
4729
4730static void
4731mxge_rem_irq(mxge_softc_t *sc)
4732{
4733	if (sc->num_slices > 1)
4734		mxge_rem_msix_irqs(sc);
4735	else
4736		mxge_rem_single_irq(sc);
4737}
4738
4739static int
4740mxge_add_irq(mxge_softc_t *sc)
4741{
4742	int err;
4743
4744	if (sc->num_slices > 1)
4745		err = mxge_add_msix_irqs(sc);
4746	else
4747		err = mxge_add_single_irq(sc);
4748
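	/*
	 * Unreachable: the constant 0 disables this MSI-X re-add path;
	 * it appears to be left in place for debugging.
	 */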
4749	if (0 && err == 0 && sc->num_slices > 1) {
4750		mxge_rem_msix_irqs(sc);
4751		err = mxge_add_msix_irqs(sc);
4752	}
4753	return err;
4754}
4755
4756
4757static int
4758mxge_attach(device_t dev)
4759{
4760	mxge_cmd_t cmd;
4761	mxge_softc_t *sc = device_get_softc(dev);
4762	struct ifnet *ifp;
4763	int err, rid;
4764
4765	sc->dev = dev;
4766	mxge_fetch_tunables(sc);
4767
4768	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4769	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4770				  taskqueue_thread_enqueue, &sc->tq);
4771	if (sc->tq == NULL) {
4772		err = ENOMEM;
4773		goto abort_with_nothing;
4774	}
4775
4776	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4777				 1,			/* alignment */
4778				 0,			/* boundary */
4779				 BUS_SPACE_MAXADDR,	/* low */
4780				 BUS_SPACE_MAXADDR,	/* high */
4781				 NULL, NULL,		/* filter */
4782				 65536 + 256,		/* maxsize */
4783				 MXGE_MAX_SEND_DESC, 	/* num segs */
4784				 65536,			/* maxsegsize */
4785				 0,			/* flags */
4786				 NULL, NULL,		/* lock */
4787				 &sc->parent_dmat);	/* tag */
4788
4789	if (err != 0) {
4790		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4791			      err);
4792		goto abort_with_tq;
4793	}
4794
4795	ifp = sc->ifp = if_alloc(IFT_ETHER);
4796	if (ifp == NULL) {
4797		device_printf(dev, "can not if_alloc()\n");
4798		err = ENOSPC;
4799		goto abort_with_parent_dmat;
4800	}
4801	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4802
4803	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4804		 device_get_nameunit(dev));
4805	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4806	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4807		 "%s:drv", device_get_nameunit(dev));
4808	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4809		 MTX_NETWORK_LOCK, MTX_DEF);
4810
4811	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4812
4813	mxge_setup_cfg_space(sc);
4814
4815	/* Map the board into the kernel */
4816	rid = PCIR_BARS;
4817	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4818					 ~0, 1, RF_ACTIVE);
4819	if (sc->mem_res == NULL) {
4820		device_printf(dev, "could not map memory\n");
4821		err = ENXIO;
4822		goto abort_with_lock;
4823	}
4824	sc->sram = rman_get_virtual(sc->mem_res);
4825	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4826	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4827		device_printf(dev, "impossible memory region size %ld\n",
4828			      rman_get_size(sc->mem_res));
4829		err = ENXIO;
4830		goto abort_with_mem_res;
4831	}
4832
4833	/* make a NULL-terminated copy of the EEPROM strings section of
4834	   lanai SRAM */
4835	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4836	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4837				rman_get_bushandle(sc->mem_res),
4838				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4839				sc->eeprom_strings,
4840				MXGE_EEPROM_STRINGS_SIZE - 2);
4841	err = mxge_parse_strings(sc);
4842	if (err != 0)
4843		goto abort_with_mem_res;
4844
4845	/* Enable write combining for efficient use of PCIe bus */
4846	mxge_enable_wc(sc);
4847
4848	/* Allocate the out of band dma memory */
4849	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4850			     sizeof (mxge_cmd_t), 64);
4851	if (err != 0)
4852		goto abort_with_mem_res;
4853	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4854	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4855	if (err != 0)
4856		goto abort_with_cmd_dma;
4857
4858	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4859	if (err != 0)
4860		goto abort_with_zeropad_dma;
4861
4862	/* select & load the firmware */
4863	err = mxge_select_firmware(sc);
4864	if (err != 0)
4865		goto abort_with_dmabench;
4866	sc->intr_coal_delay = mxge_intr_coal_delay;
4867
4868	mxge_slice_probe(sc);
4869	err = mxge_alloc_slices(sc);
4870	if (err != 0)
4871		goto abort_with_dmabench;
4872
4873	err = mxge_reset(sc, 0);
4874	if (err != 0)
4875		goto abort_with_slices;
4876
4877	err = mxge_alloc_rings(sc);
4878	if (err != 0) {
4879		device_printf(sc->dev, "failed to allocate rings\n");
4880		goto abort_with_slices;
4881	}
4882
4883	err = mxge_add_irq(sc);
4884	if (err != 0) {
4885		device_printf(sc->dev, "failed to add irq\n");
4886		goto abort_with_rings;
4887	}
4888
4889	if_initbaudrate(ifp, IF_Gbps(10));
4890	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4891		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4892		IFCAP_RXCSUM_IPV6;
4893#if defined(INET) || defined(INET6)
4894	ifp->if_capabilities |= IFCAP_LRO;
4895#endif
4896
4897#ifdef MXGE_NEW_VLAN_API
4898	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4899
4900	/* Only FW 1.4.32 and newer can do TSO over vlans */
4901	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4902	    sc->fw_ver_tiny >= 32)
4903		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4904#endif
4905	sc->max_mtu = mxge_max_mtu(sc);
4906	if (sc->max_mtu >= 9000)
4907		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4908	else
4909		device_printf(dev, "MTU limited to %d.  Install "
4910			      "latest firmware for 9000 byte jumbo support\n",
4911			      sc->max_mtu - ETHER_HDR_LEN);
4912	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4913	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4914	/* check to see if f/w supports TSO for IPv6 */
4915	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4916		if (CSUM_TCP_IPV6)
4917			ifp->if_capabilities |= IFCAP_TSO6;
4918		sc->max_tso6_hlen = min(cmd.data0,
4919					sizeof (sc->ss[0].scratch));
4920	}
4921	ifp->if_capenable = ifp->if_capabilities;
4922	if (sc->lro_cnt == 0)
4923		ifp->if_capenable &= ~IFCAP_LRO;
4924	ifp->if_init = mxge_init;
4925	ifp->if_softc = sc;
4926	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4927	ifp->if_ioctl = mxge_ioctl;
4928	ifp->if_start = mxge_start;
4929	/* Initialise the ifmedia structure */
4930	ifmedia_init(&sc->media, 0, mxge_media_change,
4931		     mxge_media_status);
4932	mxge_media_init(sc);
4933	mxge_media_probe(sc);
4934	sc->dying = 0;
4935	ether_ifattach(ifp, sc->mac_addr);
4936	/* ether_ifattach sets mtu to ETHERMTU */
4937	if (mxge_initial_mtu != ETHERMTU)
4938		mxge_change_mtu(sc, mxge_initial_mtu);
4939
4940	mxge_add_sysctls(sc);
4941#ifdef IFNET_BUF_RING
4942	ifp->if_transmit = mxge_transmit;
4943	ifp->if_qflush = mxge_qflush;
4944#endif
4945	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4946				device_get_nameunit(sc->dev));
4947	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4948	return 0;
4949
4950abort_with_rings:
4951	mxge_free_rings(sc);
4952abort_with_slices:
4953	mxge_free_slices(sc);
4954abort_with_dmabench:
4955	mxge_dma_free(&sc->dmabench_dma);
4956abort_with_zeropad_dma:
4957	mxge_dma_free(&sc->zeropad_dma);
4958abort_with_cmd_dma:
4959	mxge_dma_free(&sc->cmd_dma);
4960abort_with_mem_res:
4961	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4962abort_with_lock:
4963	pci_disable_busmaster(dev);
4964	mtx_destroy(&sc->cmd_mtx);
4965	mtx_destroy(&sc->driver_mtx);
4966	if_free(ifp);
4967abort_with_parent_dmat:
4968	bus_dma_tag_destroy(sc->parent_dmat);
4969abort_with_tq:
4970	if (sc->tq != NULL) {
4971		taskqueue_drain(sc->tq, &sc->watchdog_task);
4972		taskqueue_free(sc->tq);
4973		sc->tq = NULL;
4974	}
4975abort_with_nothing:
4976	return err;
4977}
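
/*
 * The abort_with_* chain above unwinds in strict reverse order of
 * acquisition, so each failure label releases exactly the resources
 * obtained before the failing step; mxge_detach() below repeats the
 * same teardown for the fully attached case.
 */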
4978
4979static int
4980mxge_detach(device_t dev)
4981{
4982	mxge_softc_t *sc = device_get_softc(dev);
4983
4984	if (mxge_vlans_active(sc)) {
4985		device_printf(sc->dev,
4986			      "Detach vlans before removing module\n");
4987		return EBUSY;
4988	}
4989	mtx_lock(&sc->driver_mtx);
4990	sc->dying = 1;
4991	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4992		mxge_close(sc, 0);
4993	mtx_unlock(&sc->driver_mtx);
4994	ether_ifdetach(sc->ifp);
4995	if (sc->tq != NULL) {
4996		taskqueue_drain(sc->tq, &sc->watchdog_task);
4997		taskqueue_free(sc->tq);
4998		sc->tq = NULL;
4999	}
5000	callout_drain(&sc->co_hdl);
5001	ifmedia_removeall(&sc->media);
5002	mxge_dummy_rdma(sc, 0);
5003	mxge_rem_sysctls(sc);
5004	mxge_rem_irq(sc);
5005	mxge_free_rings(sc);
5006	mxge_free_slices(sc);
5007	mxge_dma_free(&sc->dmabench_dma);
5008	mxge_dma_free(&sc->zeropad_dma);
5009	mxge_dma_free(&sc->cmd_dma);
5010	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5011	pci_disable_busmaster(dev);
5012	mtx_destroy(&sc->cmd_mtx);
5013	mtx_destroy(&sc->driver_mtx);
5014	if_free(sc->ifp);
5015	bus_dma_tag_destroy(sc->parent_dmat);
5016	return 0;
5017}
5018
5019static int
5020mxge_shutdown(device_t dev)
5021{
5022	return 0;
5023}
5024
5025/*
5026  This file uses Myri10GE driver indentation.
5027
5028  Local Variables:
5029  c-file-style:"linux"
5030  tab-width:8
5031  End:
5032*/
5033