/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 272091 2014-09-25 05:47:33Z glebius $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet6/ip6_var.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"
#include "opt_inet6.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;

	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}

/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

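	/*
	 * A 4KB-aligned allocation larger than one page may span 4KB
	 * boundaries as a single large segment; anything else is
	 * confined to a single 4KB window.
	 */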
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO), &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}

static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
	char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
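		/* advance past the terminating NUL to the next string */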
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

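	/* The extended config window is laid out ECAM-style: 1MB per
	 * bus, with one 4KB page per function (8 functions per slot).
	 */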
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
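		/* PCIe Device Control bits 14:12 encode the max read
		   request size; a value of 5 means 4096 bytes */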
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;
	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up-to-date firmware\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

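/* zlib allocator shims so inflate() can use the kernel malloc */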
static void *
z_alloc(void *nil, u_int items, u_int size)
{
	void *ptr;

	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
	return ptr;
}

static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
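		/* read back from SRAM to flush the preceding PIO
		   writes out to the NIC */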
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

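	/* align the scratch command buffer to an 8-byte boundary */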
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC.  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);
		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */
	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */
	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lc.lro_bad_csum = 0;
		ss->lc.lro_queued = 0;
		ss->lc.lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

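/*
 * Sysctl handler which byte-swaps a big-endian firmware counter into
 * host order before handing it to sysctl_handle_int() as a read-only
 * value.
 */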
static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
			       0, "number of bad csums preventing LRO");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
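	/* copy all but the first request, last slot to first */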
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}

static int
mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
    struct mxge_pkt_info *pi)
{
	struct ether_vlan_header *eh;
	uint16_t etype;
	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
#if IFCAP_TSO6 && defined(INET6)
	int nxt;
#endif

	eh = mtod(m, struct ether_vlan_header *);
	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
		etype = ntohs(eh->evl_proto);
		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	} else {
		etype = ntohs(eh->evl_encap_proto);
		pi->ip_off = ETHER_HDR_LEN;
	}

	switch (etype) {
	case ETHERTYPE_IP:
		/*
		 * ensure ip header is in first mbuf, copy it to a
		 * scratch buffer if not
		 */
		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
		pi->ip6 = NULL;
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
			    ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->ip_hlen = pi->ip->ip_hl << 2;
		if (!tso)
			return 0;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
		break;
#if IFCAP_TSO6 && defined(INET6)
	case ETHERTYPE_IPV6:
		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
			    ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		nxt = 0;
		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
		pi->ip_hlen -= pi->ip_off;
		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
			return EINVAL;

		if (!tso)
			return 0;

		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
			return EINVAL;

		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
		    sizeof(struct tcphdr))) {
			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
			    sizeof(struct tcphdr), ss->scratch);
			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
		}
		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
		break;
#endif
	default:
		return EINVAL;
	}
	return 0;
}
1851
1852#if IFCAP_TSO4
1853
1854static void
1855mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1856	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1857{
1858	mxge_tx_ring_t *tx;
1859	mcp_kreq_ether_send_t *req;
1860	bus_dma_segment_t *seg;
1861	uint32_t low, high_swapped;
1862	int len, seglen, cum_len, cum_len_next;
1863	int next_is_first, chop, cnt, rdma_count, small;
1864	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1865	uint8_t flags, flags_next;
1866	static int once;
1867
1868	mss = m->m_pkthdr.tso_segsz;
1869
1870	/* negative cum_len signifies to the
1871	 * send loop that we are still in the
1872	 * header portion of the TSO packet.
1873	 */
1874
1875	cksum_offset = pi->ip_off + pi->ip_hlen;
1876	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1877
1878	/* TSO implies checksum offload on this hardware */
1879	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1880		/*
1881		 * If packet has full TCP csum, replace it with pseudo hdr
1882		 * sum that the NIC expects, otherwise the NIC will emit
1883		 * packets with bad TCP checksums.
1884		 */
1885		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1886		if (pi->ip6) {
1887#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1888			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1889			sum = in6_cksum_pseudo(pi->ip6,
1890			    m->m_pkthdr.len - cksum_offset,
1891			    IPPROTO_TCP, 0);
1892#endif
1893		} else {
1894#ifdef INET
1895			m->m_pkthdr.csum_flags |= CSUM_TCP;
1896			sum = in_pseudo(pi->ip->ip_src.s_addr,
1897			    pi->ip->ip_dst.s_addr,
1898			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1899				    cksum_offset)));
1900#endif
1901		}
1902		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1903		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1904	}
1905	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1906
1907
1908	/* for TSO, pseudo_hdr_offset holds mss.
1909	 * The firmware figures out where to put
1910	 * the checksum by parsing the header. */
1911	pseudo_hdr_offset = htobe16(mss);
1912
1913	if (pi->ip6) {
1914		/*
1915		 * for IPv6 TSO, the "checksum offset" is re-purposed
1916		 * to store the TCP header len
1917		 */
1918		cksum_offset = (pi->tcp->th_off << 2);
1919	}
1920
1921	tx = &ss->tx;
1922	req = tx->req_list;
1923	seg = tx->seg_list;
1924	cnt = 0;
1925	rdma_count = 0;
1926	/* "rdma_count" is the number of RDMAs belonging to the
1927	 * current packet BEFORE the current send request. For
1928	 * non-TSO packets, this is equal to "count".
1929	 * For TSO packets, rdma_count needs to be reset
1930	 * to 0 after a segment cut.
1931	 *
1932	 * The rdma_count field of the send request is
1933	 * the number of RDMAs of the packet starting at
1934	 * that request. For TSO send requests with one or more cuts
1935	 * in the middle, this is the number of RDMAs starting
1936	 * after the last cut in the request. All previous
1937	 * segments before the last cut implicitly have 1 RDMA.
1938	 *
1939	 * Since the number of RDMAs is not known beforehand,
1940	 * it must be filled-in retroactively - after each
1941	 * segmentation cut or at the end of the entire packet.
1942	 */
1943
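	/*
	 * Worked example (editorial, illustrative only): suppose a run
	 * of three descriptors between cuts.  Each loop iteration does
	 * (req - rdma_count)->rdma_count = rdma_count + 1, so the first
	 * descriptor of the run is rewritten to 1, then 2, then 3; by
	 * the time the run ends, the descriptor that started it already
	 * holds the RDMA count for the whole run.
	 */
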
1944	while (busdma_seg_cnt) {
1945		/* Break the busdma segment up into pieces */
1946		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1947		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1948		len = seg->ds_len;
1949
1950		while (len) {
1951			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1952			seglen = len;
1953			cum_len_next = cum_len + seglen;
1954			(req-rdma_count)->rdma_count = rdma_count + 1;
1955			if (__predict_true(cum_len >= 0)) {
1956				/* payload */
1957				chop = (cum_len_next > mss);
1958				cum_len_next = cum_len_next % mss;
1959				next_is_first = (cum_len_next == 0);
1960				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1961				flags_next |= next_is_first *
1962					MXGEFW_FLAGS_FIRST;
1963				rdma_count |= -(chop | next_is_first);
1964				rdma_count += chop & !next_is_first;
1965			} else if (cum_len_next >= 0) {
1966				/* header ends */
1967				rdma_count = -1;
1968				cum_len_next = 0;
1969				seglen = -cum_len;
1970				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1971				flags_next = MXGEFW_FLAGS_TSO_PLD |
1972					MXGEFW_FLAGS_FIRST |
1973					(small * MXGEFW_FLAGS_SMALL);
1974			}
1975
1976			req->addr_high = high_swapped;
1977			req->addr_low = htobe32(low);
1978			req->pseudo_hdr_offset = pseudo_hdr_offset;
1979			req->pad = 0;
1980			req->rdma_count = 1;
1981			req->length = htobe16(seglen);
1982			req->cksum_offset = cksum_offset;
1983			req->flags = flags | ((cum_len & 1) *
1984					      MXGEFW_FLAGS_ALIGN_ODD);
1985			low += seglen;
1986			len -= seglen;
1987			cum_len = cum_len_next;
1988			flags = flags_next;
1989			req++;
1990			cnt++;
1991			rdma_count++;
1992			if (cksum_offset != 0 && !pi->ip6) {
1993				if (__predict_false(cksum_offset > seglen))
1994					cksum_offset -= seglen;
1995				else
1996					cksum_offset = 0;
1997			}
1998			if (__predict_false(cnt > tx->max_desc))
1999				goto drop;
2000		}
2001		busdma_seg_cnt--;
2002		seg++;
2003	}
2004	(req-rdma_count)->rdma_count = rdma_count;
2005
2006	do {
2007		req--;
2008		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2009	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2010
2011	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2012	mxge_submit_req(tx, tx->req_list, cnt);
2013#ifdef IFNET_BUF_RING
2014	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2015		/* tell the NIC to start polling this slice */
2016		*tx->send_go = 1;
2017		tx->queue_active = 1;
2018		tx->activate++;
2019		wmb();
2020	}
2021#endif
2022	return;
2023
2024drop:
2025	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2026	m_freem(m);
2027	ss->oerrors++;
2028	if (!once) {
2029		printf("tx->max_desc exceeded via TSO!\n");
2030		printf("mss = %d, %ld, %d!\n", mss,
2031		       (long)seg - (long)tx->seg_list, tx->max_desc);
2032		once = 1;
2033	}
2034	return;
2035
2036}
2037
2038#endif /* IFCAP_TSO4 */
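
/*
 * Minimal stand-alone sketch (editorial example, not driver code) of
 * the MSS "chop" arithmetic in mxge_encap_tso() above: once cum_len
 * turns non-negative the loop is in payload, and a descriptor is
 * flagged TSO_CHOP whenever the running payload total crosses an MSS
 * boundary; a total landing exactly on the boundary starts the next
 * segment (next_is_first).  The names below are hypothetical.
 */
#if 0	/* example only; compiles stand-alone with <stdio.h> */
static void
tso_chop_demo(int seg_lens[], int nsegs, int mss)
{
	int i, cum_len = 0, chop, next_is_first;

	for (i = 0; i < nsegs; i++) {
		/* same tests as the payload branch of the send loop */
		chop = (cum_len + seg_lens[i] > mss);
		cum_len = (cum_len + seg_lens[i]) % mss;
		next_is_first = (cum_len == 0);
		printf("desc %d: len %d%s%s\n", i, seg_lens[i],
		    chop ? " [TSO_CHOP]" : "",
		    next_is_first ? " [next is FIRST]" : "");
	}
}
#endif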
2039
2040#ifdef MXGE_NEW_VLAN_API
2041/*
2042 * We reproduce the software vlan tag insertion from
2043 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2044 * vlan tag insertion. We need to advertise this in order to have the
2045 * vlan interface respect our csum offload flags.
2046 */
2047static struct mbuf *
2048mxge_vlan_tag_insert(struct mbuf *m)
2049{
2050	struct ether_vlan_header *evl;
2051
2052	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2053	if (__predict_false(m == NULL))
2054		return NULL;
2055	if (m->m_len < sizeof(*evl)) {
2056		m = m_pullup(m, sizeof(*evl));
2057		if (__predict_false(m == NULL))
2058			return NULL;
2059	}
2060	/*
2061	 * Transform the Ethernet header into an Ethernet header
2062	 * with 802.1Q encapsulation.
2063	 */
2064	evl = mtod(m, struct ether_vlan_header *);
2065	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2066	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2067	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2068	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2069	m->m_flags &= ~M_VLANTAG;
2070	return m;
2071}
2072#endif /* MXGE_NEW_VLAN_API */
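
/*
 * Byte-layout sketch for the copy above (editorial, illustrative):
 * M_PREPEND opens 4 bytes of room, then the 12 address bytes
 * (ETHER_HDR_LEN - ETHER_TYPE_LEN) slide forward over them:
 *
 *   before: [4-byte slack][dst 6][src 6][type 2][payload...]
 *   after:  [dst 6][src 6][0x8100 2][tag 2][type 2][payload...]
 */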
2073
2074static void
2075mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2076{
2077	struct mxge_pkt_info pi = {0,0,0,0};
2078	mxge_softc_t *sc;
2079	mcp_kreq_ether_send_t *req;
2080	bus_dma_segment_t *seg;
2081	struct mbuf *m_tmp;
2082	struct ifnet *ifp;
2083	mxge_tx_ring_t *tx;
2084	int cnt, cum_len, err, i, idx, odd_flag;
2085	uint16_t pseudo_hdr_offset;
2086	uint8_t flags, cksum_offset;
2087
2088
2089	sc = ss->sc;
2090	ifp = sc->ifp;
2091	tx = &ss->tx;
2092
2093#ifdef MXGE_NEW_VLAN_API
2094	if (m->m_flags & M_VLANTAG) {
2095		m = mxge_vlan_tag_insert(m);
2096		if (__predict_false(m == NULL))
2097			goto drop_without_m;
2098	}
2099#endif
2100	if (m->m_pkthdr.csum_flags &
2101	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2102		if (mxge_parse_tx(ss, m, &pi))
2103			goto drop;
2104	}
2105
2106	/* (try to) map the frame for DMA */
2107	idx = tx->req & tx->mask;
2108	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2109				      m, tx->seg_list, &cnt,
2110				      BUS_DMA_NOWAIT);
2111	if (__predict_false(err == EFBIG)) {
2112		/* Too many segments in the chain.  Try
2113		   to defrag */
2114		m_tmp = m_defrag(m, M_NOWAIT);
2115		if (m_tmp == NULL) {
2116			goto drop;
2117		}
2118		ss->tx.defrag++;
2119		m = m_tmp;
2120		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2121					      tx->info[idx].map,
2122					      m, tx->seg_list, &cnt,
2123					      BUS_DMA_NOWAIT);
2124	}
2125	if (__predict_false(err != 0)) {
2126		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2127			      " packet len = %d\n", err, m->m_pkthdr.len);
2128		goto drop;
2129	}
2130	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2131			BUS_DMASYNC_PREWRITE);
2132	tx->info[idx].m = m;
2133
2134#if IFCAP_TSO4
2135	/* TSO is different enough, we handle it in another routine */
2136	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2137		mxge_encap_tso(ss, m, cnt, &pi);
2138		return;
2139	}
2140#endif
2141
2142	req = tx->req_list;
2143	cksum_offset = 0;
2144	pseudo_hdr_offset = 0;
2145	flags = MXGEFW_FLAGS_NO_TSO;
2146
2147	/* checksum offloading? */
2148	if (m->m_pkthdr.csum_flags &
2149	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2150		/* tell the firmware where the checksum field lives;
2151		   mxge_parse_tx() above already parsed the headers */
2152		cksum_offset = pi.ip_off + pi.ip_hlen;
2153		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2154		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2155		req->cksum_offset = cksum_offset;
2156		flags |= MXGEFW_FLAGS_CKSUM;
2157		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2158	} else {
2159		odd_flag = 0;
2160	}
2161	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2162		flags |= MXGEFW_FLAGS_SMALL;
2163
2164	/* convert segments into a request list */
2165	cum_len = 0;
2166	seg = tx->seg_list;
2167	req->flags = MXGEFW_FLAGS_FIRST;
2168	for (i = 0; i < cnt; i++) {
2169		req->addr_low =
2170			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2171		req->addr_high =
2172			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2173		req->length = htobe16(seg->ds_len);
2174		req->cksum_offset = cksum_offset;
2175		if (cksum_offset > seg->ds_len)
2176			cksum_offset -= seg->ds_len;
2177		else
2178			cksum_offset = 0;
2179		req->pseudo_hdr_offset = pseudo_hdr_offset;
2180		req->pad = 0; /* complete solid 16-byte block */
2181		req->rdma_count = 1;
2182		req->flags |= flags | ((cum_len & 1) * odd_flag);
2183		cum_len += seg->ds_len;
2184		seg++;
2185		req++;
2186		req->flags = 0;
2187	}
2188	req--;
2189	/* pad runts to 60 bytes */
2190	if (cum_len < 60) {
2191		req++;
2192		req->addr_low =
2193			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2194		req->addr_high =
2195			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2196		req->length = htobe16(60 - cum_len);
2197		req->cksum_offset = 0;
2198		req->pseudo_hdr_offset = pseudo_hdr_offset;
2199		req->pad = 0; /* complete solid 16-byte block */
2200		req->rdma_count = 1;
2201		req->flags |= flags | ((cum_len & 1) * odd_flag);
2202		cnt++;
2203	}
2204
2205	tx->req_list[0].rdma_count = cnt;
2206#if 0
2207	/* print what the firmware will see */
2208	for (i = 0; i < cnt; i++) {
2209		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
2210		    "cso:%d, flags:0x%x, rdma:%d\n",
2211		    i, (int)ntohl(tx->req_list[i].addr_high),
2212		    (int)ntohl(tx->req_list[i].addr_low),
2213		    (int)ntohs(tx->req_list[i].length),
2214		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2215		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2216		    tx->req_list[i].rdma_count);
2217	}
2218	printf("--------------\n");
2219#endif
2220	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2221	mxge_submit_req(tx, tx->req_list, cnt);
2222#ifdef IFNET_BUF_RING
2223	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2224		/* tell the NIC to start polling this slice */
2225		*tx->send_go = 1;
2226		tx->queue_active = 1;
2227		tx->activate++;
2228		wmb();
2229	}
2230#endif
2231	return;
2232
2233drop:
2234	m_freem(m);
2235drop_without_m:
2236	ss->oerrors++;
2237	return;
2238}
2239
2240#ifdef IFNET_BUF_RING
2241static void
2242mxge_qflush(struct ifnet *ifp)
2243{
2244	mxge_softc_t *sc = ifp->if_softc;
2245	mxge_tx_ring_t *tx;
2246	struct mbuf *m;
2247	int slice;
2248
2249	for (slice = 0; slice < sc->num_slices; slice++) {
2250		tx = &sc->ss[slice].tx;
2251		mtx_lock(&tx->mtx);
2252		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2253			m_freem(m);
2254		mtx_unlock(&tx->mtx);
2255	}
2256	if_qflush(ifp);
2257}
2258
2259static inline void
2260mxge_start_locked(struct mxge_slice_state *ss)
2261{
2262	mxge_softc_t *sc;
2263	struct mbuf *m;
2264	struct ifnet *ifp;
2265	mxge_tx_ring_t *tx;
2266
2267	sc = ss->sc;
2268	ifp = sc->ifp;
2269	tx = &ss->tx;
2270
2271	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2272		m = drbr_dequeue(ifp, tx->br);
2273		if (m == NULL) {
2274			return;
2275		}
2276		/* let BPF see it */
2277		BPF_MTAP(ifp, m);
2278
2279		/* give it to the nic */
2280		mxge_encap(ss, m);
2281	}
2282	/* ran out of transmit slots */
2283	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2284	    && (!drbr_empty(ifp, tx->br))) {
2285		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2286		tx->stall++;
2287	}
2288}
2289
2290static int
2291mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2292{
2293	mxge_softc_t *sc;
2294	struct ifnet *ifp;
2295	mxge_tx_ring_t *tx;
2296	int err;
2297
2298	sc = ss->sc;
2299	ifp = sc->ifp;
2300	tx = &ss->tx;
2301
2302	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2303	    IFF_DRV_RUNNING) {
2304		err = drbr_enqueue(ifp, tx->br, m);
2305		return (err);
2306	}
2307
2308	if (!drbr_needs_enqueue(ifp, tx->br) &&
2309	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2310		/* let BPF see it */
2311		BPF_MTAP(ifp, m);
2312		/* give it to the nic */
2313		mxge_encap(ss, m);
2314	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315		return (err);
2316	}
2317	if (!drbr_empty(ifp, tx->br))
2318		mxge_start_locked(ss);
2319	return (0);
2320}
2321
2322static int
2323mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2324{
2325	mxge_softc_t *sc = ifp->if_softc;
2326	struct mxge_slice_state *ss;
2327	mxge_tx_ring_t *tx;
2328	int err = 0;
2329	int slice;
2330
2331	slice = m->m_pkthdr.flowid;
2332	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2333
2334	ss = &sc->ss[slice];
2335	tx = &ss->tx;
2336
2337	if (mtx_trylock(&tx->mtx)) {
2338		err = mxge_transmit_locked(ss, m);
2339		mtx_unlock(&tx->mtx);
2340	} else {
2341		err = drbr_enqueue(ifp, tx->br, m);
2342	}
2343
2344	return (err);
2345}
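
/*
 * Example of the slice selection above (editorial, illustrative):
 * with num_slices == 4, a flowid of 0x1d selects slice 0x1d & 3 == 1,
 * so every packet of a flow hashes to the same tx ring and stays
 * in order.
 */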
2346
2347#else
2348
2349static inline void
2350mxge_start_locked(struct mxge_slice_state *ss)
2351{
2352	mxge_softc_t *sc;
2353	struct mbuf *m;
2354	struct ifnet *ifp;
2355	mxge_tx_ring_t *tx;
2356
2357	sc = ss->sc;
2358	ifp = sc->ifp;
2359	tx = &ss->tx;
2360	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2361		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2362		if (m == NULL) {
2363			return;
2364		}
2365		/* let BPF see it */
2366		BPF_MTAP(ifp, m);
2367
2368		/* give it to the nic */
2369		mxge_encap(ss, m);
2370	}
2371	/* ran out of transmit slots */
2372	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2373		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2374		tx->stall++;
2375	}
2376}
2377#endif
2378static void
2379mxge_start(struct ifnet *ifp)
2380{
2381	mxge_softc_t *sc = ifp->if_softc;
2382	struct mxge_slice_state *ss;
2383
2384	/* only use the first slice for now */
2385	ss = &sc->ss[0];
2386	mtx_lock(&ss->tx.mtx);
2387	mxge_start_locked(ss);
2388	mtx_unlock(&ss->tx.mtx);
2389}
2390
2391/*
2392 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2393 * at most 32 bytes at a time, so as to avoid involving the software
2394 * pio handler in the nic.  We re-write the first segment's low
2395 * DMA address to mark it valid only after we write the entire chunk
2396 * in a burst.
2397 */
2398static inline void
2399mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2400		mcp_kreq_ether_recv_t *src)
2401{
2402	uint32_t low;
2403
2404	low = src->addr_low;
2405	src->addr_low = 0xffffffff;
2406	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2407	wmb();
2408	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2409	wmb();
2410	src->addr_low = low;
2411	dst->addr_low = low;
2412	wmb();
2413}
2414
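/*
 * Ordering sketch for mxge_submit_8rx() above (editorial,
 * illustrative): the first descriptor is copied with an invalid
 * addr_low of 0xffffffff, the remaining seven land behind write
 * barriers, and only then is the real addr_low stored.  Since the
 * NIC keys on addr_low, it cannot consume the 8-entry block until
 * both 32-byte bursts are visible.
 */
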
2415static int
2416mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2417{
2418	bus_dma_segment_t seg;
2419	struct mbuf *m;
2420	mxge_rx_ring_t *rx = &ss->rx_small;
2421	int cnt, err;
2422
2423	m = m_gethdr(M_NOWAIT, MT_DATA);
2424	if (m == NULL) {
2425		rx->alloc_fail++;
2426		err = ENOBUFS;
2427		goto done;
2428	}
2429	m->m_len = MHLEN;
2430	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2431				      &seg, &cnt, BUS_DMA_NOWAIT);
2432	if (err != 0) {
2433		m_free(m);
2434		goto done;
2435	}
2436	rx->info[idx].m = m;
2437	rx->shadow[idx].addr_low =
2438		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2439	rx->shadow[idx].addr_high =
2440		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2441
2442done:
2443	if ((idx & 7) == 7)
2444		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2445	return err;
2446}
2447
2448static int
2449mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2450{
2451	bus_dma_segment_t seg[3];
2452	struct mbuf *m;
2453	mxge_rx_ring_t *rx = &ss->rx_big;
2454	int cnt, err, i;
2455
2456	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2457	if (m == NULL) {
2458		rx->alloc_fail++;
2459		err = ENOBUFS;
2460		goto done;
2461	}
2462	m->m_len = rx->mlen;
2463	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2464				      seg, &cnt, BUS_DMA_NOWAIT);
2465	if (err != 0) {
2466		m_free(m);
2467		goto done;
2468	}
2469	rx->info[idx].m = m;
2470	rx->shadow[idx].addr_low =
2471		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2472	rx->shadow[idx].addr_high =
2473		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2474
2475#if MXGE_VIRT_JUMBOS
2476	for (i = 1; i < cnt; i++) {
2477		rx->shadow[idx + i].addr_low =
2478			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2479		rx->shadow[idx + i].addr_high =
2480			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2481	}
2482#endif
2483
2484done:
2485	for (i = 0; i < rx->nbufs; i++) {
2486		if ((idx & 7) == 7) {
2487			mxge_submit_8rx(&rx->lanai[idx - 7],
2488					&rx->shadow[idx - 7]);
2489		}
2490		idx++;
2491	}
2492	return err;
2493}
2494
2495#ifdef INET6
2496
2497static uint16_t
2498mxge_csum_generic(uint16_t *raw, int len)
2499{
2500	uint32_t csum;
2501
2502
2503	csum = 0;
2504	while (len > 0) {
2505		csum += *raw;
2506		raw++;
2507		len -= 2;
2508	}
2509	csum = (csum >> 16) + (csum & 0xffff);
2510	csum = (csum >> 16) + (csum & 0xffff);
2511	return (uint16_t)csum;
2512}
2513
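/*
 * Worked example for the fold above (editorial, illustrative): a
 * 32-bit partial sum of 0x2fffe folds to (0x2 + 0xfffe) = 0x10000,
 * and the second fold absorbs the fresh carry: (0x1 + 0x0) = 0x1.
 * Two folds always suffice to reduce a 32-bit one's-complement sum
 * to 16 bits.
 */
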
2514static inline uint16_t
2515mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2516{
2517	uint32_t partial;
2518	int nxt, cksum_offset;
2519	struct ip6_hdr *ip6 = p;
2520	uint16_t c;
2521
2522	nxt = ip6->ip6_nxt;
2523	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2524	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2525		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2526					   IPPROTO_IPV6, &nxt);
2527		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2528			return (1);
2529	}
2530
2531	/*
2532	 * IPv6 headers do not contain a checksum, and hence
2533	 * do not checksum to zero, so they don't "fall out"
2534	 * of the partial checksum calculation like IPv4
2535	 * headers do.  We need to fix the partial checksum by
2536	 * subtracting the checksum of the IPv6 header.
2537	 */
2538
2539	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2540				    ETHER_HDR_LEN);
2541	csum += ~partial;
2542	csum +=	 (csum < ~partial);
2543	csum = (csum >> 16) + (csum & 0xFFFF);
2544	csum = (csum >> 16) + (csum & 0xFFFF);
2545	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2546			     csum);
2547	c ^= 0xffff;
2548	return (c);
2549}
2550#endif /* INET6 */
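
/*
 * One's-complement subtraction sketch for mxge_rx_csum6() above
 * (editorial, illustrative): "csum += ~partial; csum += (csum <
 * ~partial);" adds the complement with an end-around carry, which
 * subtracts the IPv6 header's contribution from the firmware's
 * whole-frame checksum, leaving only the L4 bytes to be checked
 * against the pseudo header sum.
 */
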
2551/*
2552 *  Myri10GE hardware checksums are not valid if the sender
2553 *  padded the frame with non-zero padding.  This is because
2554 *  the firmware just does a simple 16-bit 1s complement
2555 *  checksum across the entire frame, excluding the first 14
2556 *  bytes.  It is best to simply check the checksum and
2557 *  tell the stack about it only if the checksum is good.
2558 */
2559
2560static inline uint16_t
2561mxge_rx_csum(struct mbuf *m, int csum)
2562{
2563	struct ether_header *eh;
2564#ifdef INET
2565	struct ip *ip;
2566#endif
2567#if defined(INET) || defined(INET6)
2568	int cap = m->m_pkthdr.rcvif->if_capenable;
2569#endif
2570	uint16_t c, etype;
2571
2572
2573	eh = mtod(m, struct ether_header *);
2574	etype = ntohs(eh->ether_type);
2575	switch (etype) {
2576#ifdef INET
2577	case ETHERTYPE_IP:
2578		if ((cap & IFCAP_RXCSUM) == 0)
2579			return (1);
2580		ip = (struct ip *)(eh + 1);
2581		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2582			return (1);
2583		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2584			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2585				    (ip->ip_hl << 2) + ip->ip_p));
2586		c ^= 0xffff;
2587		break;
2588#endif
2589#ifdef INET6
2590	case ETHERTYPE_IPV6:
2591		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2592			return (1);
2593		c = mxge_rx_csum6((eh + 1), m, csum);
2594		break;
2595#endif
2596	default:
2597		c = 1;
2598	}
2599	return (c);
2600}
2601
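/*
 * Why the IPv4 path above works (editorial, illustrative): the
 * firmware checksum covers everything past the 14-byte Ethernet
 * header, and an intact IPv4 header sums to one's-complement zero
 * (0xffff), so it drops out; folding in the pseudo header via
 * in_pseudo() must then yield 0xffff for a good packet, and the
 * final c ^= 0xffff maps "good" to the zero this function's callers
 * test for.
 */
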
2602static void
2603mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2604{
2605	struct ether_vlan_header *evl;
2606	struct ether_header *eh;
2607	uint32_t partial;
2608
2609	evl = mtod(m, struct ether_vlan_header *);
2610	eh = mtod(m, struct ether_header *);
2611
2612	/*
2613	 * fix the checksum by subtracting the checksum of the
2614	 * ETHER_VLAN_ENCAP_LEN bytes that follow what the firmware
2615	 * thought was the end of the ethernet header.
2616	 */
2617
2618	/* put checksum into host byte order */
2619	*csum = ntohs(*csum);
2620	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2621	(*csum) += ~partial;
2622	(*csum) +=  ((*csum) < ~partial);
2623	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625
2626	/* restore checksum to network byte order;
2627	   later consumers expect this */
2628	*csum = htons(*csum);
2629
2630	/* save the tag */
2631#ifdef MXGE_NEW_VLAN_API
2632	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2633#else
2634	{
2635		struct m_tag *mtag;
2636		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2637				   M_NOWAIT);
2638		if (mtag == NULL)
2639			return;
2640		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2641		m_tag_prepend(m, mtag);
2642	}
2643
2644#endif
2645	m->m_flags |= M_VLANTAG;
2646
2647	/*
2648	 * Remove the 802.1q header by copying the Ethernet
2649	 * addresses over it and adjusting the beginning of
2650	 * the data in the mbuf.  The encapsulated Ethernet
2651	 * type field is already in place.
2652	 */
2653	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2654	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2655	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2656}
2657
2658
2659static inline void
2660mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2661		 uint32_t csum, int lro)
2662{
2663	mxge_softc_t *sc;
2664	struct ifnet *ifp;
2665	struct mbuf *m;
2666	struct ether_header *eh;
2667	mxge_rx_ring_t *rx;
2668	bus_dmamap_t old_map;
2669	int idx;
2670
2671	sc = ss->sc;
2672	ifp = sc->ifp;
2673	rx = &ss->rx_big;
2674	idx = rx->cnt & rx->mask;
2675	rx->cnt += rx->nbufs;
2676	/* save a pointer to the received mbuf */
2677	m = rx->info[idx].m;
2678	/* try to replace the received mbuf */
2679	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2680		/* drop the frame -- the old mbuf is re-cycled */
2681		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2682		return;
2683	}
2684
2685	/* unmap the received buffer */
2686	old_map = rx->info[idx].map;
2687	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2688	bus_dmamap_unload(rx->dmat, old_map);
2689
2690	/* swap the bus_dmamap_t's */
2691	rx->info[idx].map = rx->extra_map;
2692	rx->extra_map = old_map;
2693
2694	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2695	 * aligned */
2696	m->m_data += MXGEFW_PAD;
2697
2698	m->m_pkthdr.rcvif = ifp;
2699	m->m_len = m->m_pkthdr.len = len;
2700	ss->ipackets++;
2701	eh = mtod(m, struct ether_header *);
2702	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2703		mxge_vlan_tag_remove(m, &csum);
2704	}
2705	/* if the checksum is valid, mark it in the mbuf header */
2706
2707	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2708	    (0 == mxge_rx_csum(m, csum))) {
2709		/* Tell the stack that the checksum is good */
2710		m->m_pkthdr.csum_data = 0xffff;
2711		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2712			CSUM_DATA_VALID;
2713
2714#if defined(INET) || defined (INET6)
2715		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2716			return;
2717#endif
2718	}
2719	/* flowid only valid if RSS hashing is enabled */
2720	if (sc->num_slices > 1) {
2721		m->m_pkthdr.flowid = (ss - sc->ss);
2722		m->m_flags |= M_FLOWID;
2723	}
2724	/* pass the frame up the stack */
2725	(*ifp->if_input)(ifp, m);
2726}
2727
2728static inline void
2729mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2730		   uint32_t csum, int lro)
2731{
2732	mxge_softc_t *sc;
2733	struct ifnet *ifp;
2734	struct ether_header *eh;
2735	struct mbuf *m;
2736	mxge_rx_ring_t *rx;
2737	bus_dmamap_t old_map;
2738	int idx;
2739
2740	sc = ss->sc;
2741	ifp = sc->ifp;
2742	rx = &ss->rx_small;
2743	idx = rx->cnt & rx->mask;
2744	rx->cnt++;
2745	/* save a pointer to the received mbuf */
2746	m = rx->info[idx].m;
2747	/* try to replace the received mbuf */
2748	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2749		/* drop the frame -- the old mbuf is re-cycled */
2750		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2751		return;
2752	}
2753
2754	/* unmap the received buffer */
2755	old_map = rx->info[idx].map;
2756	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2757	bus_dmamap_unload(rx->dmat, old_map);
2758
2759	/* swap the bus_dmamap_t's */
2760	rx->info[idx].map = rx->extra_map;
2761	rx->extra_map = old_map;
2762
2763	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2764	 * aligned */
2765	m->m_data += MXGEFW_PAD;
2766
2767	m->m_pkthdr.rcvif = ifp;
2768	m->m_len = m->m_pkthdr.len = len;
2769	ss->ipackets++;
2770	eh = mtod(m, struct ether_header *);
2771	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2772		mxge_vlan_tag_remove(m, &csum);
2773	}
2774	/* if the checksum is valid, mark it in the mbuf header */
2775	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2776	    (0 == mxge_rx_csum(m, csum))) {
2777		/* Tell the stack that the checksum is good */
2778		m->m_pkthdr.csum_data = 0xffff;
2779		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2780			CSUM_DATA_VALID;
2781
2782#if defined(INET) || defined (INET6)
2783		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2784			return;
2785#endif
2786	}
2787	/* flowid only valid if RSS hashing is enabled */
2788	if (sc->num_slices > 1) {
2789		m->m_pkthdr.flowid = (ss - sc->ss);
2790		m->m_flags |= M_FLOWID;
2791	}
2792	/* pass the frame up the stack */
2793	(*ifp->if_input)(ifp, m);
2794}
2795
2796static inline void
2797mxge_clean_rx_done(struct mxge_slice_state *ss)
2798{
2799	mxge_rx_done_t *rx_done = &ss->rx_done;
2800	int limit = 0;
2801	uint16_t length;
2802	uint16_t checksum;
2803	int lro;
2804
2805	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806	while (rx_done->entry[rx_done->idx].length != 0) {
2807		length = ntohs(rx_done->entry[rx_done->idx].length);
2808		rx_done->entry[rx_done->idx].length = 0;
2809		checksum = rx_done->entry[rx_done->idx].checksum;
2810		if (length <= (MHLEN - MXGEFW_PAD))
2811			mxge_rx_done_small(ss, length, checksum, lro);
2812		else
2813			mxge_rx_done_big(ss, length, checksum, lro);
2814		rx_done->cnt++;
2815		rx_done->idx = rx_done->cnt & rx_done->mask;
2816
2817		/* limit potential for livelock */
2818		if (__predict_false(++limit > rx_done->mask / 2))
2819			break;
2820	}
2821#if defined(INET)  || defined (INET6)
2822	while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2823		struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2824		SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2825		tcp_lro_flush(&ss->lc, lro);
2826	}
2827#endif
2828}
2829
2830
2831static inline void
2832mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2833{
2834	struct ifnet *ifp;
2835	mxge_tx_ring_t *tx;
2836	struct mbuf *m;
2837	bus_dmamap_t map;
2838	int idx;
2839	int *flags;
2840
2841	tx = &ss->tx;
2842	ifp = ss->sc->ifp;
2843	while (tx->pkt_done != mcp_idx) {
2844		idx = tx->done & tx->mask;
2845		tx->done++;
2846		m = tx->info[idx].m;
2847		/* the mbuf and DMA map are attached only to the
2848		   first send descriptor of each mbuf */
2849		if (m != NULL) {
2850			ss->obytes += m->m_pkthdr.len;
2851			if (m->m_flags & M_MCAST)
2852				ss->omcasts++;
2853			ss->opackets++;
2854			tx->info[idx].m = NULL;
2855			map = tx->info[idx].map;
2856			bus_dmamap_unload(tx->dmat, map);
2857			m_freem(m);
2858		}
2859		if (tx->info[idx].flag) {
2860			tx->info[idx].flag = 0;
2861			tx->pkt_done++;
2862		}
2863	}
2864
2865	/* If we have space, clear IFF_OACTIVE to tell the stack that
2866	   it's OK to send packets */
2867#ifdef IFNET_BUF_RING
2868	flags = &ss->if_drv_flags;
2869#else
2870	flags = &ifp->if_drv_flags;
2871#endif
2872	mtx_lock(&ss->tx.mtx);
2873	if ((*flags) & IFF_DRV_OACTIVE &&
2874	    tx->req - tx->done < (tx->mask + 1)/4) {
2875		*(flags) &= ~IFF_DRV_OACTIVE;
2876		ss->tx.wake++;
2877		mxge_start_locked(ss);
2878	}
2879#ifdef IFNET_BUF_RING
2880	if (ss->sc->num_slices > 1) {
2881		/* let the NIC stop polling this queue, since there
2882		 * are no more transmits pending */
2883		if (tx->req == tx->done) {
2884			*tx->send_stop = 1;
2885			tx->queue_active = 0;
2886			tx->deactivate++;
2887			wmb();
2888		}
2889	}
2890#endif
2891	mtx_unlock(&ss->tx.mtx);
2892
2893}
2894
2895static struct mxge_media_type mxge_xfp_media_types[] =
2896{
2897	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2898	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2899	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2900	{0,		(1 << 5),	"10GBASE-ER"},
2901	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2902	{0,		(1 << 3),	"10GBASE-SW"},
2903	{0,		(1 << 2),	"10GBASE-LW"},
2904	{0,		(1 << 1),	"10GBASE-EW"},
2905	{0,		(1 << 0),	"Reserved"}
2906};
2907static struct mxge_media_type mxge_sfp_media_types[] =
2908{
2909	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2910	{0,		(1 << 7),	"Reserved"},
2911	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2912	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2913	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2914	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2915};
2916
2917static void
2918mxge_media_set(mxge_softc_t *sc, int media_type)
2919{
2920
2921
2922	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2923		    0, NULL);
2924	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2925	sc->current_media = media_type;
2926	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2927}
2928
2929static void
2930mxge_media_init(mxge_softc_t *sc)
2931{
2932	char *ptr;
2933	int i;
2934
2935	ifmedia_removeall(&sc->media);
2936	mxge_media_set(sc, IFM_AUTO);
2937
2938	/*
2939	 * parse the product code to determine the interface type
2940	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2941	 * after the 3rd dash in the driver's cached copy of the
2942	 * EEPROM's product code string.
2943	 */
2944	ptr = sc->product_code_string;
2945	if (ptr == NULL) {
2946		device_printf(sc->dev, "Missing product code\n");
2947		return;
2948	}
2949
2950	for (i = 0; i < 3; i++, ptr++) {
2951		ptr = strchr(ptr, '-');
2952		if (ptr == NULL) {
2953			device_printf(sc->dev,
2954				      "only %d dashes in PC?!?\n", i);
2955			return;
2956		}
2957	}
2958	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2959		/* -C is CX4 */
2960		sc->connector = MXGE_CX4;
2961		mxge_media_set(sc, IFM_10G_CX4);
2962	} else if (*ptr == 'Q') {
2963		/* -Q is Quad Ribbon Fiber */
2964		sc->connector = MXGE_QRF;
2965		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2966		/* FreeBSD has no media type for Quad ribbon fiber */
2967	} else if (*ptr == 'R') {
2968		/* -R is XFP */
2969		sc->connector = MXGE_XFP;
2970	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2971		/* -S or -2S is SFP+ */
2972		sc->connector = MXGE_SFP;
2973	} else {
2974		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2975	}
2976}
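
/*
 * Parse example (editorial; the product code shown is hypothetical):
 * for "10G-PCIE-8B-S" the loop above skips three dashes and lands on
 * 'S', so the connector is SFP+; 'C' would mean CX4, 'R' XFP, and
 * 'Q' Quad Ribbon Fiber.  The *(ptr + 1) checks cover codes such as
 * "-2S" where a digit precedes the letter.
 */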
2977
2978/*
2979 * Determine the media type for a NIC.  Some XFPs will identify
2980 * themselves only when their link is up, so this is initiated via a
2981 * link up interrupt.  However, this can potentially take up to
2982 * several milliseconds, so it is run via the watchdog routine, rather
2983 * than in the interrupt handler itself.
2984 */
2985static void
2986mxge_media_probe(mxge_softc_t *sc)
2987{
2988	mxge_cmd_t cmd;
2989	char *cage_type;
2990
2991	struct mxge_media_type *mxge_media_types = NULL;
2992	int i, err, ms, mxge_media_type_entries;
2993	uint32_t byte;
2994
2995	sc->need_media_probe = 0;
2996
2997	if (sc->connector == MXGE_XFP) {
2998		/* -R is XFP */
2999		mxge_media_types = mxge_xfp_media_types;
3000		mxge_media_type_entries =
3001			sizeof (mxge_xfp_media_types) /
3002			sizeof (mxge_xfp_media_types[0]);
3003		byte = MXGE_XFP_COMPLIANCE_BYTE;
3004		cage_type = "XFP";
3005	} else if (sc->connector == MXGE_SFP) {
3006		/* -S or -2S is SFP+ */
3007		mxge_media_types = mxge_sfp_media_types;
3008		mxge_media_type_entries =
3009			sizeof (mxge_sfp_media_types) /
3010			sizeof (mxge_sfp_media_types[0]);
3011		cage_type = "SFP+";
3012		byte = 3;
3013	} else {
3014		/* nothing to do; media type cannot change */
3015		return;
3016	}
3017
3018	/*
3019	 * At this point we know the NIC has an XFP or SFP+ cage, so
3020	 * now we try to determine what is in the cage by using the
3021	 * firmware's I2C commands to read the module's 10GbE compliance
3022	 * register.  We read just one byte, which may take over
3023	 * a millisecond.
3024	 */
3025
3026	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3027	cmd.data1 = byte;
3028	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3029	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3030		device_printf(sc->dev, "failed to read XFP\n");
3031	}
3032	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3033		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3034	}
3035	if (err != MXGEFW_CMD_OK) {
3036		return;
3037	}
3038
3039	/* now we wait for the data to be cached */
3040	cmd.data0 = byte;
3041	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3042	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3043		DELAY(1000);
3044		cmd.data0 = byte;
3045		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3046	}
3047	if (err != MXGEFW_CMD_OK) {
3048		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3049			      cage_type, err, ms);
3050		return;
3051	}
3052
3053	if (cmd.data0 == mxge_media_types[0].bitmask) {
3054		if (mxge_verbose)
3055			device_printf(sc->dev, "%s:%s\n", cage_type,
3056				      mxge_media_types[0].name);
3057		if (sc->current_media != mxge_media_types[0].flag) {
3058			mxge_media_init(sc);
3059			mxge_media_set(sc, mxge_media_types[0].flag);
3060		}
3061		return;
3062	}
3063	for (i = 1; i < mxge_media_type_entries; i++) {
3064		if (cmd.data0 & mxge_media_types[i].bitmask) {
3065			if (mxge_verbose)
3066				device_printf(sc->dev, "%s:%s\n",
3067					      cage_type,
3068					      mxge_media_types[i].name);
3069
3070			if (sc->current_media != mxge_media_types[i].flag) {
3071				mxge_media_init(sc);
3072				mxge_media_set(sc, mxge_media_types[i].flag);
3073			}
3074			return;
3075		}
3076	}
3077	if (mxge_verbose)
3078		device_printf(sc->dev, "%s media 0x%x unknown\n",
3079			      cage_type, cmd.data0);
3080
3081	return;
3082}
3083
3084static void
3085mxge_intr(void *arg)
3086{
3087	struct mxge_slice_state *ss = arg;
3088	mxge_softc_t *sc = ss->sc;
3089	mcp_irq_data_t *stats = ss->fw_stats;
3090	mxge_tx_ring_t *tx = &ss->tx;
3091	mxge_rx_done_t *rx_done = &ss->rx_done;
3092	uint32_t send_done_count;
3093	uint8_t valid;
3094
3095
3096#ifndef IFNET_BUF_RING
3097	/* an interrupt on a non-zero slice is implicitly valid
3098	   since MSI-X irqs are not shared */
3099	if (ss != sc->ss) {
3100		mxge_clean_rx_done(ss);
3101		*ss->irq_claim = be32toh(3);
3102		return;
3103	}
3104#endif
3105
3106	/* make sure the DMA has finished */
3107	if (!stats->valid) {
3108		return;
3109	}
3110	valid = stats->valid;
3111
3112	if (sc->legacy_irq) {
3113		/* lower legacy IRQ  */
3114		*sc->irq_deassert = 0;
3115		if (!mxge_deassert_wait)
3116			/* don't wait for confirmation that irq is low */
3117			stats->valid = 0;
3118	} else {
3119		stats->valid = 0;
3120	}
3121
3122	/* loop while waiting for legacy irq deassertion */
3123	do {
3124		/* check for transmit completes and receives */
3125		send_done_count = be32toh(stats->send_done_count);
3126		while ((send_done_count != tx->pkt_done) ||
3127		       (rx_done->entry[rx_done->idx].length != 0)) {
3128			if (send_done_count != tx->pkt_done)
3129				mxge_tx_done(ss, (int)send_done_count);
3130			mxge_clean_rx_done(ss);
3131			send_done_count = be32toh(stats->send_done_count);
3132		}
3133		if (sc->legacy_irq && mxge_deassert_wait)
3134			wmb();
3135	} while (*((volatile uint8_t *) &stats->valid));
3136
3137	/* fw link & error stats meaningful only on the first slice */
3138	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3139		if (sc->link_state != stats->link_up) {
3140			sc->link_state = stats->link_up;
3141			if (sc->link_state) {
3142				if_link_state_change(sc->ifp, LINK_STATE_UP);
3143				if (mxge_verbose)
3144					device_printf(sc->dev, "link up\n");
3145			} else {
3146				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3147				if (mxge_verbose)
3148					device_printf(sc->dev, "link down\n");
3149			}
3150			sc->need_media_probe = 1;
3151		}
3152		if (sc->rdma_tags_available !=
3153		    be32toh(stats->rdma_tags_available)) {
3154			sc->rdma_tags_available =
3155				be32toh(stats->rdma_tags_available);
3156			device_printf(sc->dev, "RDMA timed out! %d tags "
3157				      "left\n", sc->rdma_tags_available);
3158		}
3159
3160		if (stats->link_down) {
3161			sc->down_cnt += stats->link_down;
3162			sc->link_state = 0;
3163			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3164		}
3165	}
3166
3167	/* check to see if we have an rx token to pass back */
3168	if (valid & 0x1)
3169		*ss->irq_claim = be32toh(3);
3170	*(ss->irq_claim + 1) = be32toh(3);
3171}
3172
3173static void
3174mxge_init(void *arg)
3175{
3176	mxge_softc_t *sc = arg;
3177	struct ifnet *ifp = sc->ifp;
3178
3179
3180	mtx_lock(&sc->driver_mtx);
3181	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3182		(void) mxge_open(sc);
3183	mtx_unlock(&sc->driver_mtx);
3184}
3185
3186
3187
3188static void
3189mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3190{
3191	int i;
3192
3193#if defined(INET) || defined(INET6)
3194	tcp_lro_free(&ss->lc);
3195#endif
3196	for (i = 0; i <= ss->rx_big.mask; i++) {
3197		if (ss->rx_big.info[i].m == NULL)
3198			continue;
3199		bus_dmamap_unload(ss->rx_big.dmat,
3200				  ss->rx_big.info[i].map);
3201		m_freem(ss->rx_big.info[i].m);
3202		ss->rx_big.info[i].m = NULL;
3203	}
3204
3205	for (i = 0; i <= ss->rx_small.mask; i++) {
3206		if (ss->rx_small.info[i].m == NULL)
3207			continue;
3208		bus_dmamap_unload(ss->rx_small.dmat,
3209				  ss->rx_small.info[i].map);
3210		m_freem(ss->rx_small.info[i].m);
3211		ss->rx_small.info[i].m = NULL;
3212	}
3213
3214	/* transmit ring used only on the first slice */
3215	if (ss->tx.info == NULL)
3216		return;
3217
3218	for (i = 0; i <= ss->tx.mask; i++) {
3219		ss->tx.info[i].flag = 0;
3220		if (ss->tx.info[i].m == NULL)
3221			continue;
3222		bus_dmamap_unload(ss->tx.dmat,
3223				  ss->tx.info[i].map);
3224		m_freem(ss->tx.info[i].m);
3225		ss->tx.info[i].m = NULL;
3226	}
3227}
3228
3229static void
3230mxge_free_mbufs(mxge_softc_t *sc)
3231{
3232	int slice;
3233
3234	for (slice = 0; slice < sc->num_slices; slice++)
3235		mxge_free_slice_mbufs(&sc->ss[slice]);
3236}
3237
3238static void
3239mxge_free_slice_rings(struct mxge_slice_state *ss)
3240{
3241	int i;
3242
3243
3244	if (ss->rx_done.entry != NULL)
3245		mxge_dma_free(&ss->rx_done.dma);
3246	ss->rx_done.entry = NULL;
3247
3248	if (ss->tx.req_bytes != NULL)
3249		free(ss->tx.req_bytes, M_DEVBUF);
3250	ss->tx.req_bytes = NULL;
3251
3252	if (ss->tx.seg_list != NULL)
3253		free(ss->tx.seg_list, M_DEVBUF);
3254	ss->tx.seg_list = NULL;
3255
3256	if (ss->rx_small.shadow != NULL)
3257		free(ss->rx_small.shadow, M_DEVBUF);
3258	ss->rx_small.shadow = NULL;
3259
3260	if (ss->rx_big.shadow != NULL)
3261		free(ss->rx_big.shadow, M_DEVBUF);
3262	ss->rx_big.shadow = NULL;
3263
3264	if (ss->tx.info != NULL) {
3265		if (ss->tx.dmat != NULL) {
3266			for (i = 0; i <= ss->tx.mask; i++) {
3267				bus_dmamap_destroy(ss->tx.dmat,
3268						   ss->tx.info[i].map);
3269			}
3270			bus_dma_tag_destroy(ss->tx.dmat);
3271		}
3272		free(ss->tx.info, M_DEVBUF);
3273	}
3274	ss->tx.info = NULL;
3275
3276	if (ss->rx_small.info != NULL) {
3277		if (ss->rx_small.dmat != NULL) {
3278			for (i = 0; i <= ss->rx_small.mask; i++) {
3279				bus_dmamap_destroy(ss->rx_small.dmat,
3280						   ss->rx_small.info[i].map);
3281			}
3282			bus_dmamap_destroy(ss->rx_small.dmat,
3283					   ss->rx_small.extra_map);
3284			bus_dma_tag_destroy(ss->rx_small.dmat);
3285		}
3286		free(ss->rx_small.info, M_DEVBUF);
3287	}
3288	ss->rx_small.info = NULL;
3289
3290	if (ss->rx_big.info != NULL) {
3291		if (ss->rx_big.dmat != NULL) {
3292			for (i = 0; i <= ss->rx_big.mask; i++) {
3293				bus_dmamap_destroy(ss->rx_big.dmat,
3294						   ss->rx_big.info[i].map);
3295			}
3296			bus_dmamap_destroy(ss->rx_big.dmat,
3297					   ss->rx_big.extra_map);
3298			bus_dma_tag_destroy(ss->rx_big.dmat);
3299		}
3300		free(ss->rx_big.info, M_DEVBUF);
3301	}
3302	ss->rx_big.info = NULL;
3303}
3304
3305static void
3306mxge_free_rings(mxge_softc_t *sc)
3307{
3308	int slice;
3309
3310	for (slice = 0; slice < sc->num_slices; slice++)
3311		mxge_free_slice_rings(&sc->ss[slice]);
3312}
3313
3314static int
3315mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3316		       int tx_ring_entries)
3317{
3318	mxge_softc_t *sc = ss->sc;
3319	size_t bytes;
3320	int err, i;
3321
3322	/* allocate per-slice receive resources */
3323
3324	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3325	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3326
3327	/* allocate the rx shadow rings */
3328	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3329	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3330
3331	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3332	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3333
3334	/* allocate the rx host info rings */
3335	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3336	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3337
3338	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3339	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3340
3341	/* allocate the rx busdma resources */
3342	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3343				 1,			/* alignment */
3344				 4096,			/* boundary */
3345				 BUS_SPACE_MAXADDR,	/* low */
3346				 BUS_SPACE_MAXADDR,	/* high */
3347				 NULL, NULL,		/* filter */
3348				 MHLEN,			/* maxsize */
3349				 1,			/* num segs */
3350				 MHLEN,			/* maxsegsize */
3351				 BUS_DMA_ALLOCNOW,	/* flags */
3352				 NULL, NULL,		/* lock */
3353				 &ss->rx_small.dmat);	/* tag */
3354	if (err != 0) {
3355		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3356			      err);
3357		return err;
3358	}
3359
3360	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3361				 1,			/* alignment */
3362#if MXGE_VIRT_JUMBOS
3363				 4096,			/* boundary */
3364#else
3365				 0,			/* boundary */
3366#endif
3367				 BUS_SPACE_MAXADDR,	/* low */
3368				 BUS_SPACE_MAXADDR,	/* high */
3369				 NULL, NULL,		/* filter */
3370				 3*4096,		/* maxsize */
3371#if MXGE_VIRT_JUMBOS
3372				 3,			/* num segs */
3373				 4096,			/* maxsegsize*/
3374#else
3375				 1,			/* num segs */
3376				 MJUM9BYTES,		/* maxsegsize*/
3377#endif
3378				 BUS_DMA_ALLOCNOW,	/* flags */
3379				 NULL, NULL,		/* lock */
3380				 &ss->rx_big.dmat);	/* tag */
3381	if (err != 0) {
3382		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3383			      err);
3384		return err;
3385	}
3386	for (i = 0; i <= ss->rx_small.mask; i++) {
3387		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3388					&ss->rx_small.info[i].map);
3389		if (err != 0) {
3390			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3391				      err);
3392			return err;
3393		}
3394	}
3395	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3396				&ss->rx_small.extra_map);
3397	if (err != 0) {
3398		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3399			      err);
3400		return err;
3401	}
3402
3403	for (i = 0; i <= ss->rx_big.mask; i++) {
3404		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3405					&ss->rx_big.info[i].map);
3406		if (err != 0) {
3407			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3408				      err);
3409			return err;
3410		}
3411	}
3412	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3413				&ss->rx_big.extra_map);
3414	if (err != 0) {
3415		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3416			      err);
3417		return err;
3418	}
3419
3420	/* now allocate TX resources */
3421
3422#ifndef IFNET_BUF_RING
3423	/* only use a single TX ring for now */
3424	if (ss != ss->sc->ss)
3425		return 0;
3426#endif
3427
3428	ss->tx.mask = tx_ring_entries - 1;
3429	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3430
3431
3432	/* allocate the tx request copy block */
3433	bytes = 8 +
3434		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3435	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3436	/* ensure req_list entries are aligned to 8 bytes */
3437	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3438		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
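	/*
	 * Alignment example (editorial, illustrative): had malloc()
	 * returned 0x...1003, adding 7 gives 0x...100a and masking
	 * with ~7UL yields 0x...1008: the first 8-byte-aligned
	 * address within the 8 spare bytes allocated above.
	 */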
3439
3440	/* allocate the tx busdma segment list */
3441	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3442	ss->tx.seg_list = (bus_dma_segment_t *)
3443		malloc(bytes, M_DEVBUF, M_WAITOK);
3444
3445	/* allocate the tx host info ring */
3446	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3447	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3448
3449	/* allocate the tx busdma resources */
3450	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3451				 1,			/* alignment */
3452				 sc->tx_boundary,	/* boundary */
3453				 BUS_SPACE_MAXADDR,	/* low */
3454				 BUS_SPACE_MAXADDR,	/* high */
3455				 NULL, NULL,		/* filter */
3456				 65536 + 256,		/* maxsize */
3457				 ss->tx.max_desc - 2,	/* num segs */
3458				 sc->tx_boundary,	/* maxsegsz */
3459				 BUS_DMA_ALLOCNOW,	/* flags */
3460				 NULL, NULL,		/* lock */
3461				 &ss->tx.dmat);		/* tag */
3462
3463	if (err != 0) {
3464		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3465			      err);
3466		return err;
3467	}
3468
3469	/* now use these tags to set up dmamaps for each slot
3470	   in the ring */
3471	for (i = 0; i <= ss->tx.mask; i++) {
3472		err = bus_dmamap_create(ss->tx.dmat, 0,
3473					&ss->tx.info[i].map);
3474		if (err != 0) {
3475			device_printf(sc->dev, "Err %d  tx dmamap\n",
3476				      err);
3477			return err;
3478		}
3479	}
3480	return 0;
3481
3482}
3483
3484static int
3485mxge_alloc_rings(mxge_softc_t *sc)
3486{
3487	mxge_cmd_t cmd;
3488	int tx_ring_size;
3489	int tx_ring_entries, rx_ring_entries;
3490	int err, slice;
3491
3492	/* get ring sizes */
3493	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3494	tx_ring_size = cmd.data0;
3495	if (err != 0) {
3496		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3497		goto abort;
3498	}
3499
3500	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3501	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3502	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3503	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3504	IFQ_SET_READY(&sc->ifp->if_snd);
3505
3506	for (slice = 0; slice < sc->num_slices; slice++) {
3507		err = mxge_alloc_slice_rings(&sc->ss[slice],
3508					     rx_ring_entries,
3509					     tx_ring_entries);
3510		if (err != 0)
3511			goto abort;
3512	}
3513	return 0;
3514
3515abort:
3516	mxge_free_rings(sc);
3517	return err;
3518
3519}
3520
3521
3522static void
3523mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3524{
3525	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3526
3527	if (bufsize < MCLBYTES) {
3528		/* easy, everything fits in a single buffer */
3529		*big_buf_size = MCLBYTES;
3530		*cl_size = MCLBYTES;
3531		*nbufs = 1;
3532		return;
3533	}
3534
3535	if (bufsize < MJUMPAGESIZE) {
3536		/* still easy, everything still fits in a single buffer */
3537		*big_buf_size = MJUMPAGESIZE;
3538		*cl_size = MJUMPAGESIZE;
3539		*nbufs = 1;
3540		return;
3541	}
3542#if MXGE_VIRT_JUMBOS
3543	/* now we need to use virtually contiguous buffers */
3544	*cl_size = MJUM9BYTES;
3545	*big_buf_size = 4096;
3546	*nbufs = mtu / 4096 + 1;
3547	/* needs to be a power of two, so round up */
3548	if (*nbufs == 3)
3549		*nbufs = 4;
3550#else
3551	*cl_size = MJUM9BYTES;
3552	*big_buf_size = MJUM9BYTES;
3553	*nbufs = 1;
3554#endif
3555}
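
/*
 * Worked examples for the sizing above (editorial, illustrative):
 * a standard 1500-byte MTU needs 1500 + 14 + 4 + 2 = 1520 bytes and
 * fits a single 2KB MCLBYTES cluster; a 9000-byte jumbo MTU needs
 * 9020 bytes, which (without MXGE_VIRT_JUMBOS) takes one MJUM9BYTES
 * (9KB) cluster.
 */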
3556
3557static int
3558mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3559{
3560	mxge_softc_t *sc;
3561	mxge_cmd_t cmd;
3562	bus_dmamap_t map;
3563	int err, i, slice;
3564
3565
3566	sc = ss->sc;
3567	slice = ss - sc->ss;
3568
3569#if defined(INET) || defined(INET6)
3570	(void)tcp_lro_init(&ss->lc);
3571#endif
3572	ss->lc.ifp = sc->ifp;
3573
3574	/* get the lanai pointers to the send and receive rings */
3575
3576	err = 0;
3577#ifndef IFNET_BUF_RING
3578	/* We currently only send from the first slice */
3579	if (slice == 0) {
3580#endif
3581		cmd.data0 = slice;
3582		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3583		ss->tx.lanai =
3584			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3585		ss->tx.send_go = (volatile uint32_t *)
3586			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3587		ss->tx.send_stop = (volatile uint32_t *)
3588			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3589#ifndef IFNET_BUF_RING
3590	}
3591#endif
3592	cmd.data0 = slice;
3593	err |= mxge_send_cmd(sc,
3594			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3595	ss->rx_small.lanai =
3596		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3597	cmd.data0 = slice;
3598	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3599	ss->rx_big.lanai =
3600		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3601
3602	if (err != 0) {
3603		device_printf(sc->dev,
3604			      "failed to get ring sizes or locations\n");
3605		return EIO;
3606	}
3607
3608	/* stock receive rings */
3609	for (i = 0; i <= ss->rx_small.mask; i++) {
3610		map = ss->rx_small.info[i].map;
3611		err = mxge_get_buf_small(ss, map, i);
3612		if (err) {
3613			device_printf(sc->dev, "alloced %d/%d smalls\n",
3614				      i, ss->rx_small.mask + 1);
3615			return ENOMEM;
3616		}
3617	}
3618	for (i = 0; i <= ss->rx_big.mask; i++) {
3619		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3620		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3621	}
3622	ss->rx_big.nbufs = nbufs;
3623	ss->rx_big.cl_size = cl_size;
3624	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3625		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3626	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3627		map = ss->rx_big.info[i].map;
3628		err = mxge_get_buf_big(ss, map, i);
3629		if (err) {
3630			device_printf(sc->dev, "alloced %d/%d bigs\n",
3631				      i, ss->rx_big.mask + 1);
3632			return ENOMEM;
3633		}
3634	}
3635	return 0;
3636}
3637
3638static int
3639mxge_open(mxge_softc_t *sc)
3640{
3641	mxge_cmd_t cmd;
3642	int err, big_bytes, nbufs, slice, cl_size, i;
3643	bus_addr_t bus;
3644	volatile uint8_t *itable;
3645	struct mxge_slice_state *ss;
3646
3647	/* Copy the MAC address in case it was overridden */
3648	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3649
3650	err = mxge_reset(sc, 1);
3651	if (err != 0) {
3652		device_printf(sc->dev, "failed to reset\n");
3653		return EIO;
3654	}
3655
3656	if (sc->num_slices > 1) {
3657		/* setup the indirection table */
3658		cmd.data0 = sc->num_slices;
3659		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3660				    &cmd);
3661
3662		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3663				     &cmd);
3664		if (err != 0) {
3665			device_printf(sc->dev,
3666				      "failed to setup rss tables\n");
3667			return err;
3668		}
3669
3670		/* just enable an identity mapping */
3671		itable = sc->sram + cmd.data0;
3672		for (i = 0; i < sc->num_slices; i++)
3673			itable[i] = (uint8_t)i;
3674
3675		cmd.data0 = 1;
3676		cmd.data1 = mxge_rss_hash_type;
3677		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3678		if (err != 0) {
3679			device_printf(sc->dev, "failed to enable slices\n");
3680			return err;
3681		}
3682	}
3683
3684
3685	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3686
3687	cmd.data0 = nbufs;
3688	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3689			    &cmd);
3690	/* error is only meaningful if we're trying to set
3691	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3692	if (err && nbufs > 1) {
3693		device_printf(sc->dev,
3694			      "Failed to set always-use-n to %d\n",
3695			      nbufs);
3696		return EIO;
3697	}
3698	/* Give the firmware the mtu and the big and small buffer
3699	   sizes.  The firmware wants the big buf size to be a power
3700	   of two. Luckily, FreeBSD's clusters are powers of two */
3701	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3702	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3703	cmd.data0 = MHLEN - MXGEFW_PAD;
3704	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3705			     &cmd);
3706	cmd.data0 = big_bytes;
3707	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3708
3709	if (err != 0) {
3710		device_printf(sc->dev, "failed to setup params\n");
3711		goto abort;
3712	}
3713
3714	/* Now give the firmware the pointer to the stats block */
3715	for (slice = 0;
3716#ifdef IFNET_BUF_RING
3717	     slice < sc->num_slices;
3718#else
3719	     slice < 1;
3720#endif
3721	     slice++) {
3722		ss = &sc->ss[slice];
3723		cmd.data0 =
3724			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3725		cmd.data1 =
3726			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3727		cmd.data2 = sizeof(struct mcp_irq_data);
3728		cmd.data2 |= (slice << 16);
3729		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3730	}
3731
3732	if (err != 0) {
3733		bus = sc->ss->fw_stats_dma.bus_addr;
3734		bus += offsetof(struct mcp_irq_data, send_done_count);
3735		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3736		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3737		err = mxge_send_cmd(sc,
3738				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3739				    &cmd);
3740		/* Firmware cannot support multicast without STATS_DMA_V2 */
3741		sc->fw_multicast_support = 0;
3742	} else {
3743		sc->fw_multicast_support = 1;
3744	}
3745
3746	if (err != 0) {
3747		device_printf(sc->dev, "failed to setup params\n");
3748		goto abort;
3749	}
3750
3751	for (slice = 0; slice < sc->num_slices; slice++) {
3752		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3753		if (err != 0) {
3754			device_printf(sc->dev, "couldn't open slice %d\n",
3755				      slice);
3756			goto abort;
3757		}
3758	}
3759
3760	/* Finally, start the firmware running */
3761	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3762	if (err) {
3763		device_printf(sc->dev, "Couldn't bring up link\n");
3764		goto abort;
3765	}
3766#ifdef IFNET_BUF_RING
3767	for (slice = 0; slice < sc->num_slices; slice++) {
3768		ss = &sc->ss[slice];
3769		ss->if_drv_flags |= IFF_DRV_RUNNING;
3770		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3771	}
3772#endif
3773	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3774	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3775
3776	return 0;
3777
3778
3779abort:
3780	mxge_free_mbufs(sc);
3781
3782	return err;
3783}
3784
3785static int
3786mxge_close(mxge_softc_t *sc, int down)
3787{
3788	mxge_cmd_t cmd;
3789	int err, old_down_cnt;
3790#ifdef IFNET_BUF_RING
3791	struct mxge_slice_state *ss;
3792	int slice;
3793#endif
3794
3795#ifdef IFNET_BUF_RING
3796	for (slice = 0; slice < sc->num_slices; slice++) {
3797		ss = &sc->ss[slice];
3798		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3799	}
3800#endif
3801	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3802	if (!down) {
3803		old_down_cnt = sc->down_cnt;
3804		wmb();
3805		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3806		if (err) {
3807			device_printf(sc->dev,
3808				      "Couldn't bring down link\n");
3809		}
3810		if (old_down_cnt == sc->down_cnt) {
3811			/* wait for down irq */
3812			DELAY(10 * sc->intr_coal_delay);
3813		}
3814		wmb();
3815		if (old_down_cnt == sc->down_cnt) {
3816			device_printf(sc->dev, "never got down irq\n");
3817		}
3818	}
3819	mxge_free_mbufs(sc);
3820
3821	return 0;
3822}
3823
3824static void
3825mxge_setup_cfg_space(mxge_softc_t *sc)
3826{
3827	device_t dev = sc->dev;
3828	int reg;
3829	uint16_t lnk, pectl;
3830
3831	/* find the PCIe link width and set max read request to 4KB */
3832	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3833		lnk = pci_read_config(dev, reg + 0x12, 2);
3834		sc->link_width = (lnk >> 4) & 0x3f;
3835
3836		if (sc->pectl == 0) {
3837			pectl = pci_read_config(dev, reg + 0x8, 2);
3838			pectl = (pectl & ~0x7000) | (5 << 12);
3839			pci_write_config(dev, reg + 0x8, pectl, 2);
3840			sc->pectl = pectl;
3841		} else {
3842			/* restore saved pectl after watchdog reset */
3843			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3844		}
3845	}
3846
3847	/* Enable DMA and Memory space access */
3848	pci_enable_busmaster(dev);
3849}
3850
3851static uint32_t
3852mxge_read_reboot(mxge_softc_t *sc)
3853{
3854	device_t dev = sc->dev;
3855	uint32_t vs;
3856
3857	/* find the vendor specific offset */
3858	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3859		device_printf(sc->dev,
3860			      "could not find vendor specific offset\n");
3861		return (uint32_t)-1;
3862	}
3863	/* enable read32 mode */
3864	pci_write_config(dev, vs + 0x10, 0x3, 1);
3865	/* tell NIC which register to read */
3866	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3867	return (pci_read_config(dev, vs + 0x14, 4));
3868}
3869
3870static void
3871mxge_watchdog_reset(mxge_softc_t *sc)
3872{
3873	struct pci_devinfo *dinfo;
3874	struct mxge_slice_state *ss;
3875	int err, running, s, num_tx_slices = 1;
3876	uint32_t reboot;
3877	uint16_t cmd;
3878
3879	err = ENXIO;
3880
3881	device_printf(sc->dev, "Watchdog reset!\n");
3882
3883	/*
3884	 * check to see if the NIC rebooted.  If it did, then all of
3885	 * PCI config space has been reset, and things like the
3886	 * busmaster bit will be zero.  If this is the case, then we
3887	 * must restore PCI config space before the NIC can be used
3888	 * again
3889	 */
3890	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3891	if (cmd == 0xffff) {
3892		/*
3893		 * maybe the watchdog caught the NIC rebooting; wait
3894		 * up to 100ms for it to finish.  If it does not come
3895		 * back, then give up
3896		 */
3897		DELAY(1000*100);
3898		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3899		if (cmd == 0xffff) {
3900			device_printf(sc->dev, "NIC disappeared!\n");
3901		}
3902	}
3903	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3904		/* print the reboot status */
3905		reboot = mxge_read_reboot(sc);
3906		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3907			      reboot);
3908		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3909		if (running) {
3910
3911			/*
3912			 * quiesce NIC so that TX routines will not try to
3913			 * xmit after restoration of BAR
3914			 */
3915
3916			/* Mark the link as down */
3917			if (sc->link_state) {
3918				sc->link_state = 0;
3919				if_link_state_change(sc->ifp,
3920						     LINK_STATE_DOWN);
3921			}
3922#ifdef IFNET_BUF_RING
3923			num_tx_slices = sc->num_slices;
3924#endif
3925			/* grab all TX locks to ensure no tx */
3926			for (s = 0; s < num_tx_slices; s++) {
3927				ss = &sc->ss[s];
3928				mtx_lock(&ss->tx.mtx);
3929			}
3930			mxge_close(sc, 1);
3931		}
3932		/* restore PCI configuration space */
3933		dinfo = device_get_ivars(sc->dev);
3934		pci_cfg_restore(sc->dev, dinfo);
3935
3936		/* and redo any changes we made to our config space */
3937		mxge_setup_cfg_space(sc);
3938
3939		/* reload f/w */
3940		err = mxge_load_firmware(sc, 0);
3941		if (err) {
3942			device_printf(sc->dev,
3943				      "Unable to re-load f/w\n");
3944		}
3945		if (running) {
3946			if (!err)
3947				err = mxge_open(sc);
3948			/* release all TX locks */
3949			for (s = 0; s < num_tx_slices; s++) {
3950				ss = &sc->ss[s];
3951#ifdef IFNET_BUF_RING
3952				mxge_start_locked(ss);
3953#endif
3954				mtx_unlock(&ss->tx.mtx);
3955			}
3956		}
3957		sc->watchdog_resets++;
3958	} else {
3959		device_printf(sc->dev,
3960			      "NIC did not reboot, not resetting\n");
3961		err = 0;
3962	}
3963	if (err) {
3964		device_printf(sc->dev, "watchdog reset failed\n");
3965	} else {
3966		if (sc->dying == 2)
3967			sc->dying = 0;
3968		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3969	}
3970}
3971
3972static void
3973mxge_watchdog_task(void *arg, int pending)
3974{
3975	mxge_softc_t *sc = arg;
3976
3977
3978	mtx_lock(&sc->driver_mtx);
3979	mxge_watchdog_reset(sc);
3980	mtx_unlock(&sc->driver_mtx);
3981}
3982
3983static void
3984mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3985{
3986	tx = &sc->ss[slice].tx;
3987	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3988	device_printf(sc->dev,
3989		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3990		      tx->req, tx->done, tx->queue_active);
3991	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3992		      tx->activate, tx->deactivate);
3993	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3994		      tx->pkt_done,
3995		      be32toh(sc->ss->fw_stats->send_done_count));
3996}
3997
3998static int
3999mxge_watchdog(mxge_softc_t *sc)
4000{
4001	mxge_tx_ring_t *tx;
4002	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
4003	int i, err = 0;
4004
4005	/* see if we have outstanding transmits, which
4006	   have been pending for more than mxge_ticks */
4007	for (i = 0;
4008#ifdef IFNET_BUF_RING
4009	     (i < sc->num_slices) && (err == 0);
4010#else
4011	     (i < 1) && (err == 0);
4012#endif
4013	     i++) {
4014		tx = &sc->ss[i].tx;
4015		if (tx->req != tx->done &&
4016		    tx->watchdog_req != tx->watchdog_done &&
4017		    tx->done == tx->watchdog_done) {
4018			/* check for pause blocking before resetting */
4019			if (tx->watchdog_rx_pause == rx_pause) {
4020				mxge_warn_stuck(sc, tx, i);
4021				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4022				return (ENXIO);
4023			}
4024			else
4025				device_printf(sc->dev, "Flow control blocking "
4026					      "xmits, check link partner\n");
4027		}
4028
4029		tx->watchdog_req = tx->req;
4030		tx->watchdog_done = tx->done;
4031		tx->watchdog_rx_pause = rx_pause;
4032	}
4033
4034	if (sc->need_media_probe)
4035		mxge_media_probe(sc);
4036	return (err);
4037}
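
/*
 * Stuck-slice test above, restated: a slice is suspected hung when
 * sends are outstanding now (req != done), sends were already
 * outstanding at the previous tick (watchdog_req != watchdog_done),
 * and nothing completed in between (done == watchdog_done).  An
 * unchanged dropped_pause counter rules out a pause-frame storm, so
 * the watchdog task resets the NIC; otherwise the link partner's
 * flow control is blamed.
 */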
4038
4039static uint64_t
4040mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4041{
4042	struct mxge_softc *sc;
4043	uint64_t rv;
4044
4045	sc = if_getsoftc(ifp);
4046	rv = 0;
4047
4048	switch (cnt) {
4049	case IFCOUNTER_IPACKETS:
4050		for (int s = 0; s < sc->num_slices; s++)
4051			rv += sc->ss[s].ipackets;
4052		return (rv);
4053	case IFCOUNTER_OPACKETS:
4054		for (int s = 0; s < sc->num_slices; s++)
4055			rv += sc->ss[s].opackets;
4056		return (rv);
4057	case IFCOUNTER_OERRORS:
4058		for (int s = 0; s < sc->num_slices; s++)
4059			rv += sc->ss[s].oerrors;
4060		return (rv);
4061#ifdef IFNET_BUF_RING
4062	case IFCOUNTER_OBYTES:
4063		for (int s = 0; s < sc->num_slices; s++)
4064			rv += sc->ss[s].obytes;
4065		return (rv);
4066	case IFCOUNTER_OMCASTS:
4067		for (int s = 0; s < sc->num_slices; s++)
4068			rv += sc->ss[s].omcasts;
4069		return (rv);
4070	case IFCOUNTER_OQDROPS:
4071		for (int s = 0; s < sc->num_slices; s++)
4072			rv += sc->ss[s].tx.br->br_drops;
4073		return (rv);
4074#endif
4075	default:
4076		return (if_get_counter_default(ifp, cnt));
4077	}
4078}
4079
4080static void
4081mxge_tick(void *arg)
4082{
4083	mxge_softc_t *sc = arg;
4084	u_long pkts = 0;
4085	int err = 0;
4086	int running, ticks;
4087	uint16_t cmd;
4088
4089	ticks = mxge_ticks;
4090	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4091	if (running) {
4092		if (!sc->watchdog_countdown) {
4093			err = mxge_watchdog(sc);
4094			sc->watchdog_countdown = 4;
4095		}
4096		sc->watchdog_countdown--;
4097	}
4098	if (pkts == 0) {
4099		/* ensure NIC did not suffer h/w fault while idle */
4100		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4101		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4102			sc->dying = 2;
4103			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4104			err = ENXIO;
4105		}
4106		/* look less often if NIC is idle */
4107		ticks *= 4;
4108	}
4109
4110	if (err == 0)
4111		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4112
4113}
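
/*
 * Note that pkts is never updated in this revision (per-slice packet
 * counts are reported through mxge_get_counter() instead), so the
 * idle branch above always runs and the callout always re-arms at
 * 4 * mxge_ticks.
 */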
4114
4115static int
4116mxge_media_change(struct ifnet *ifp)
4117{
4118	return EINVAL;
4119}
4120
4121static int
4122mxge_change_mtu(mxge_softc_t *sc, int mtu)
4123{
4124	struct ifnet *ifp = sc->ifp;
4125	int real_mtu, old_mtu;
4126	int err = 0;
4127
4128
4129	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4130	if ((real_mtu > sc->max_mtu) || (real_mtu < 60))
4131		return EINVAL;
4132	mtx_lock(&sc->driver_mtx);
4133	old_mtu = ifp->if_mtu;
4134	ifp->if_mtu = mtu;
4135	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4136		mxge_close(sc, 0);
4137		err = mxge_open(sc);
4138		if (err != 0) {
4139			ifp->if_mtu = old_mtu;
4140			mxge_close(sc, 0);
4141			(void) mxge_open(sc);
4142		}
4143	}
4144	mtx_unlock(&sc->driver_mtx);
4145	return err;
4146}
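
/*
 * Example: "ifconfig mxge0 mtu 9000" reaches here as SIOCSIFMTU and
 * yields real_mtu = 9000 + ETHER_HDR_LEN (14) + ETHER_VLAN_ENCAP_LEN
 * (4) = 9018 bytes, accepted only if the firmware reported a max_mtu
 * of at least 9018.
 */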
4147
4148static void
4149mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4150{
4151	mxge_softc_t *sc = ifp->if_softc;
4152
4153
4154	if (sc == NULL)
4155		return;
4156	ifmr->ifm_status = IFM_AVALID;
4157	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4158	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4159	ifmr->ifm_active |= sc->current_media;
4160}
4161
4162static int
4163mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4164{
4165	mxge_softc_t *sc = ifp->if_softc;
4166	struct ifreq *ifr = (struct ifreq *)data;
4167	int err, mask;
4168
4169	err = 0;
4170	switch (command) {
4171	case SIOCSIFADDR:
4172	case SIOCGIFADDR:
4173		err = ether_ioctl(ifp, command, data);
4174		break;
4175
4176	case SIOCSIFMTU:
4177		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4178		break;
4179
4180	case SIOCSIFFLAGS:
4181		mtx_lock(&sc->driver_mtx);
4182		if (sc->dying) {
4183			mtx_unlock(&sc->driver_mtx);
4184			return EINVAL;
4185		}
4186		if (ifp->if_flags & IFF_UP) {
4187			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4188				err = mxge_open(sc);
4189			} else {
4190				/* take care of promisc and allmulti
4191				   flag changes */
4192				mxge_change_promisc(sc,
4193						    ifp->if_flags & IFF_PROMISC);
4194				mxge_set_multicast_list(sc);
4195			}
4196		} else {
4197			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4198				mxge_close(sc, 0);
4199			}
4200		}
4201		mtx_unlock(&sc->driver_mtx);
4202		break;
4203
4204	case SIOCADDMULTI:
4205	case SIOCDELMULTI:
4206		mtx_lock(&sc->driver_mtx);
4207		mxge_set_multicast_list(sc);
4208		mtx_unlock(&sc->driver_mtx);
4209		break;
4210
4211	case SIOCSIFCAP:
4212		mtx_lock(&sc->driver_mtx);
4213		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4214		if (mask & IFCAP_TXCSUM) {
4215			if (IFCAP_TXCSUM & ifp->if_capenable) {
4216				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4217				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4218			} else {
4219				ifp->if_capenable |= IFCAP_TXCSUM;
4220				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4221			}
4222		} else if (mask & IFCAP_RXCSUM) {
4223			if (IFCAP_RXCSUM & ifp->if_capenable) {
4224				ifp->if_capenable &= ~IFCAP_RXCSUM;
4225			} else {
4226				ifp->if_capenable |= IFCAP_RXCSUM;
4227			}
4228		}
4229		if (mask & IFCAP_TSO4) {
4230			if (IFCAP_TSO4 & ifp->if_capenable) {
4231				ifp->if_capenable &= ~IFCAP_TSO4;
4232			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4233				ifp->if_capenable |= IFCAP_TSO4;
4234				ifp->if_hwassist |= CSUM_TSO;
4235			} else {
4236				printf("mxge requires tx checksum offload"
4237				       " be enabled to use TSO\n");
4238				err = EINVAL;
4239			}
4240		}
4241#if IFCAP_TSO6
4242		if (mask & IFCAP_TXCSUM_IPV6) {
4243			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4244				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4245						       | IFCAP_TSO6);
4246				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4247						      | CSUM_UDP);
4248			} else {
4249				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4250				ifp->if_hwassist |= (CSUM_TCP_IPV6
4251						     | CSUM_UDP_IPV6);
4252			}
4253		} else if (mask & IFCAP_RXCSUM_IPV6) {
4254			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4255				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4256			} else {
4257				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4258			}
4259		}
4260		if (mask & IFCAP_TSO6) {
4261			if (IFCAP_TSO6 & ifp->if_capenable) {
4262				ifp->if_capenable &= ~IFCAP_TSO6;
4263			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4264				ifp->if_capenable |= IFCAP_TSO6;
4265				ifp->if_hwassist |= CSUM_TSO;
4266			} else {
4267				printf("mxge requires tx checksum offload"
4268				       " be enabled to use TSO\n");
4269				err = EINVAL;
4270			}
4271		}
4272#endif /* IFCAP_TSO6 */
4273
4274		if (mask & IFCAP_LRO)
4275			ifp->if_capenable ^= IFCAP_LRO;
4276		if (mask & IFCAP_VLAN_HWTAGGING)
4277			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4278		if (mask & IFCAP_VLAN_HWTSO)
4279			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4280
4281		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4282		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4283			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4284
4285		mtx_unlock(&sc->driver_mtx);
4286		VLAN_CAPABILITIES(ifp);
4287
4288		break;
4289
4290	case SIOCGIFMEDIA:
4291		mtx_lock(&sc->driver_mtx);
4292		mxge_media_probe(sc);
4293		mtx_unlock(&sc->driver_mtx);
4294		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4295				    &sc->media, command);
4296		break;
4297
4298	default:
4299		err = ENOTTY;
4300	}
4301	return err;
4302}
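
/*
 * Example: "ifconfig mxge0 -txcsum" arrives as SIOCSIFCAP; the TXCSUM
 * case above then clears IFCAP_TSO4 along with IFCAP_TXCSUM, since
 * TSO requires transmit checksum offload.
 */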
4303
4304static void
4305mxge_fetch_tunables(mxge_softc_t *sc)
4306{
4307
4308	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4309	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4310			  &mxge_flow_control);
4311	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4312			  &mxge_intr_coal_delay);
4313	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4314			  &mxge_nvidia_ecrc_enable);
4315	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4316			  &mxge_force_firmware);
4317	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4318			  &mxge_deassert_wait);
4319	TUNABLE_INT_FETCH("hw.mxge.verbose",
4320			  &mxge_verbose);
4321	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4322	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4323	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4324	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
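	/* both spellings feed mxge_rss_hash_type; the latter wins if both are set */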
4325	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4326	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4327
4328	if (bootverbose)
4329		mxge_verbose = 1;
4330	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4331		mxge_intr_coal_delay = 30;
4332	if (mxge_ticks == 0)
4333		mxge_ticks = hz / 2;
4334	sc->pause = mxge_flow_control;
4335	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4336	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4337		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4338	}
4339	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4340	    mxge_initial_mtu < ETHER_MIN_LEN)
4341		mxge_initial_mtu = ETHERMTU_JUMBO;
4342
4343	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4344		mxge_throttle = MXGE_MAX_THROTTLE;
4345	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4346		mxge_throttle = MXGE_MIN_THROTTLE;
4347	sc->throttle = mxge_throttle;
4348}
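
/*
 * Illustrative /boot/loader.conf settings for the tunables fetched
 * above (example values, not recommendations):
 *
 *	hw.mxge.max_slices=-1		# -1 caps slices at the CPU count
 *	hw.mxge.intr_coal_delay=30	# usecs, clamped to [0, 10000]
 *	hw.mxge.flow_control_enabled=1
 *	hw.mxge.initial_mtu=9000	# clamped to [ETHER_MIN_LEN, ETHERMTU_JUMBO]
 */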
4349
4350
4351static void
4352mxge_free_slices(mxge_softc_t *sc)
4353{
4354	struct mxge_slice_state *ss;
4355	int i;
4356
4357
4358	if (sc->ss == NULL)
4359		return;
4360
4361	for (i = 0; i < sc->num_slices; i++) {
4362		ss = &sc->ss[i];
4363		if (ss->fw_stats != NULL) {
4364			mxge_dma_free(&ss->fw_stats_dma);
4365			ss->fw_stats = NULL;
4366#ifdef IFNET_BUF_RING
4367			if (ss->tx.br != NULL) {
4368				drbr_free(ss->tx.br, M_DEVBUF);
4369				ss->tx.br = NULL;
4370			}
4371#endif
4372			mtx_destroy(&ss->tx.mtx);
4373		}
4374		if (ss->rx_done.entry != NULL) {
4375			mxge_dma_free(&ss->rx_done.dma);
4376			ss->rx_done.entry = NULL;
4377		}
4378	}
4379	free(sc->ss, M_DEVBUF);
4380	sc->ss = NULL;
4381}
4382
4383static int
4384mxge_alloc_slices(mxge_softc_t *sc)
4385{
4386	mxge_cmd_t cmd;
4387	struct mxge_slice_state *ss;
4388	size_t bytes;
4389	int err, i, max_intr_slots;
4390
4391	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4392	if (err != 0) {
4393		device_printf(sc->dev, "Cannot determine rx ring size\n");
4394		return err;
4395	}
4396	sc->rx_ring_size = cmd.data0;
4397	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4398
4399	bytes = sizeof (*sc->ss) * sc->num_slices;
4400	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4401	if (sc->ss == NULL)
4402		return (ENOMEM);
4403	for (i = 0; i < sc->num_slices; i++) {
4404		ss = &sc->ss[i];
4405
4406		ss->sc = sc;
4407
4408		/* allocate per-slice rx interrupt queues */
4409
4410		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4411		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4412		if (err != 0)
4413			goto abort;
4414		ss->rx_done.entry = ss->rx_done.dma.addr;
4415		bzero(ss->rx_done.entry, bytes);
4416
4417		/*
4418		 * allocate the per-slice firmware stats; stats
4419		 * (including tx) are used only on the first
4420		 * slice for now
4421		 */
4422#ifndef IFNET_BUF_RING
4423		if (i > 0)
4424			continue;
4425#endif
4426
4427		bytes = sizeof (*ss->fw_stats);
4428		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4429				     sizeof (*ss->fw_stats), 64);
4430		if (err != 0)
4431			goto abort;
4432		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4433		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4434			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4435		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4436#ifdef IFNET_BUF_RING
4437		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4438					   &ss->tx.mtx);
4439#endif
4440	}
4441
4442	return (0);
4443
4444abort:
4445	mxge_free_slices(sc);
4446	return (ENOMEM);
4447}
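
/*
 * The interrupt queue is sized above at two slots per receive DMA
 * descriptor, presumably so a single queue can absorb completions
 * from both the small and big rx rings.
 */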
4448
4449static void
4450mxge_slice_probe(mxge_softc_t *sc)
4451{
4452	mxge_cmd_t cmd;
4453	char *old_fw;
4454	int msix_cnt, status, max_intr_slots;
4455
4456	sc->num_slices = 1;
4457	/*
4458	 *  don't enable multiple slices unless the tunable allows it,
4459	 *  and only on SMP systems
4460	 */
4461
4462	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4463		return;
4464
4465	/* see how many MSI-X interrupts are available */
4466	msix_cnt = pci_msix_count(sc->dev);
4467	if (msix_cnt < 2)
4468		return;
4469
4470	/* now load the slice-aware firmware to see what it supports */
4471	old_fw = sc->fw_name;
4472	if (old_fw == mxge_fw_aligned)
4473		sc->fw_name = mxge_fw_rss_aligned;
4474	else
4475		sc->fw_name = mxge_fw_rss_unaligned;
4476	status = mxge_load_firmware(sc, 0);
4477	if (status != 0) {
4478		device_printf(sc->dev, "Falling back to a single slice\n");
4479		return;
4480	}
4481
4482	/* try to send a reset command to the card to see if it
4483	   is alive */
4484	memset(&cmd, 0, sizeof (cmd));
4485	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4486	if (status != 0) {
4487		device_printf(sc->dev, "failed reset\n");
4488		goto abort_with_fw;
4489	}
4490
4491	/* get rx ring size */
4492	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4493	if (status != 0) {
4494		device_printf(sc->dev, "Cannot determine rx ring size\n");
4495		goto abort_with_fw;
4496	}
4497	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4498
4499	/* tell it the size of the interrupt queues */
4500	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4501	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4502	if (status != 0) {
4503		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4504		goto abort_with_fw;
4505	}
4506
4507	/* ask for the maximum number of slices it supports */
4508	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4509	if (status != 0) {
4510		device_printf(sc->dev,
4511			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4512		goto abort_with_fw;
4513	}
4514	sc->num_slices = cmd.data0;
4515	if (sc->num_slices > msix_cnt)
4516		sc->num_slices = msix_cnt;
4517
4518	if (mxge_max_slices == -1) {
4519		/* cap to number of CPUs in system */
4520		if (sc->num_slices > mp_ncpus)
4521			sc->num_slices = mp_ncpus;
4522	} else {
4523		if (sc->num_slices > mxge_max_slices)
4524			sc->num_slices = mxge_max_slices;
4525	}
4526	/* make sure it is a power of two */
4527	while (sc->num_slices & (sc->num_slices - 1))
4528		sc->num_slices--;
4529
4530	if (mxge_verbose)
4531		device_printf(sc->dev, "using %d slices\n",
4532			      sc->num_slices);
4533
4534	return;
4535
4536abort_with_fw:
4537	sc->fw_name = old_fw;
4538	(void) mxge_load_firmware(sc, 0);
4539}
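
/*
 * Worked example for the sizing above: the firmware reports 8 RSS
 * queues, 16 MSI-X vectors are available, mp_ncpus is 6 and
 * mxge_max_slices is -1.  num_slices starts at min(8, 16) = 8, is
 * capped to 6 CPUs, and the power-of-two loop then walks 6 -> 5 -> 4,
 * so 4 slices are used.
 */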
4540
4541static int
4542mxge_add_msix_irqs(mxge_softc_t *sc)
4543{
4544	size_t bytes;
4545	int count, err, i, rid;
4546
4547	rid = PCIR_BAR(2);
4548	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4549						    &rid, RF_ACTIVE);
4550
4551	if (sc->msix_table_res == NULL) {
4552		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4553		return ENXIO;
4554	}
4555
4556	count = sc->num_slices;
4557	err = pci_alloc_msix(sc->dev, &count);
4558	if (err != 0) {
4559		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4560			      "err = %d\n", sc->num_slices, err);
4561		goto abort_with_msix_table;
4562	}
4563	if (count < sc->num_slices) {
4564		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4565			      sc->num_slices, count);
4566		device_printf(sc->dev,
4567			      "Try setting hw.mxge.max_slices to %d\n",
4568			      count);
4569		err = ENOSPC;
4570		goto abort_with_msix;
4571	}
4572	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4573	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4574	if (sc->msix_irq_res == NULL) {
4575		err = ENOMEM;
4576		goto abort_with_msix;
4577	}
4578
4579	for (i = 0; i < sc->num_slices; i++) {
4580		rid = i + 1;
4581		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4582							  SYS_RES_IRQ,
4583							  &rid, RF_ACTIVE);
4584		if (sc->msix_irq_res[i] == NULL) {
4585			device_printf(sc->dev, "couldn't allocate IRQ res"
4586				      " for message %d\n", i);
4587			err = ENXIO;
4588			goto abort_with_res;
4589		}
4590	}
4591
4592	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4593	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {	/* M_NOWAIT allocations can fail */
		err = ENOMEM;
		goto abort_with_res;
	}
4594
4595	for (i = 0; i < sc->num_slices; i++) {
4596		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4597				     INTR_TYPE_NET | INTR_MPSAFE,
4598#if __FreeBSD_version > 700030
4599				     NULL,
4600#endif
4601				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4602		if (err != 0) {
4603			device_printf(sc->dev, "couldn't setup intr for "
4604				      "message %d\n", i);
4605			goto abort_with_intr;
4606		}
4607		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4608				  sc->msix_ih[i], "s%d", i);
4609	}
4610
4611	if (mxge_verbose) {
4612		device_printf(sc->dev, "using %d msix IRQs:",
4613			      sc->num_slices);
4614		for (i = 0; i < sc->num_slices; i++)
4615			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4616		printf("\n");
4617	}
4618	return (0);
4619
4620abort_with_intr:
4621	for (i = 0; i < sc->num_slices; i++) {
4622		if (sc->msix_ih[i] != NULL) {
4623			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4624					  sc->msix_ih[i]);
4625			sc->msix_ih[i] = NULL;
4626		}
4627	}
4628	free(sc->msix_ih, M_DEVBUF);
4629
4630
4631abort_with_res:
4632	for (i = 0; i < sc->num_slices; i++) {
4633		rid = i + 1;
4634		if (sc->msix_irq_res[i] != NULL)
4635			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4636					     sc->msix_irq_res[i]);
4637		sc->msix_irq_res[i] = NULL;
4638	}
4639	free(sc->msix_irq_res, M_DEVBUF);
4640
4641
4642abort_with_msix:
4643	pci_release_msi(sc->dev);
4644
4645abort_with_msix_table:
4646	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4647			     sc->msix_table_res);
4648
4649	return err;
4650}
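
/*
 * Resource layout used above: the MSI-X table lives in PCIR_BAR(2),
 * while the vectors themselves are SYS_RES_IRQ rids 1..num_slices,
 * one per slice.
 */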
4651
4652static int
4653mxge_add_single_irq(mxge_softc_t *sc)
4654{
4655	int count, err, rid;
4656
4657	count = pci_msi_count(sc->dev);
4658	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4659		rid = 1;
4660	} else {
4661		rid = 0;
4662		sc->legacy_irq = 1;
4663	}
4664	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4665					 1, RF_SHAREABLE | RF_ACTIVE);
4666	if (sc->irq_res == NULL) {
4667		device_printf(sc->dev, "could not alloc interrupt\n");
4668		return ENXIO;
4669	}
4670	if (mxge_verbose)
4671		device_printf(sc->dev, "using %s irq %ld\n",
4672			      sc->legacy_irq ? "INTx" : "MSI",
4673			      rman_get_start(sc->irq_res));
4674	err = bus_setup_intr(sc->dev, sc->irq_res,
4675			     INTR_TYPE_NET | INTR_MPSAFE,
4676#if __FreeBSD_version > 700030
4677			     NULL,
4678#endif
4679			     mxge_intr, &sc->ss[0], &sc->ih);
4680	if (err != 0) {
4681		bus_release_resource(sc->dev, SYS_RES_IRQ,
4682				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4683		if (!sc->legacy_irq)
4684			pci_release_msi(sc->dev);
4685	}
4686	return err;
4687}
4688
4689static void
4690mxge_rem_msix_irqs(mxge_softc_t *sc)
4691{
4692	int i, rid;
4693
4694	for (i = 0; i < sc->num_slices; i++) {
4695		if (sc->msix_ih[i] != NULL) {
4696			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4697					  sc->msix_ih[i]);
4698			sc->msix_ih[i] = NULL;
4699		}
4700	}
4701	free(sc->msix_ih, M_DEVBUF);
4702
4703	for (i = 0; i < sc->num_slices; i++) {
4704		rid = i + 1;
4705		if (sc->msix_irq_res[i] != NULL)
4706			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4707					     sc->msix_irq_res[i]);
4708		sc->msix_irq_res[i] = NULL;
4709	}
4710	free(sc->msix_irq_res, M_DEVBUF);
4711
4712	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4713			     sc->msix_table_res);
4714
4715	pci_release_msi(sc->dev);
4716	return;
4717}
4718
4719static void
4720mxge_rem_single_irq(mxge_softc_t *sc)
4721{
4722	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4723	bus_release_resource(sc->dev, SYS_RES_IRQ,
4724			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4725	if (!sc->legacy_irq)
4726		pci_release_msi(sc->dev);
4727}
4728
4729static void
4730mxge_rem_irq(mxge_softc_t *sc)
4731{
4732	if (sc->num_slices > 1)
4733		mxge_rem_msix_irqs(sc);
4734	else
4735		mxge_rem_single_irq(sc);
4736}
4737
4738static int
4739mxge_add_irq(mxge_softc_t *sc)
4740{
4741	int err;
4742
4743	if (sc->num_slices > 1)
4744		err = mxge_add_msix_irqs(sc);
4745	else
4746		err = mxge_add_single_irq(sc);
4747
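	/* note: the "0 &&" below leaves this MSI-X retry path disabled */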
4748	if (0 && err == 0 && sc->num_slices > 1) {
4749		mxge_rem_msix_irqs(sc);
4750		err = mxge_add_msix_irqs(sc);
4751	}
4752	return err;
4753}
4754
4755
4756static int
4757mxge_attach(device_t dev)
4758{
4759	mxge_cmd_t cmd;
4760	mxge_softc_t *sc = device_get_softc(dev);
4761	struct ifnet *ifp;
4762	int err, rid;
4763
4764	sc->dev = dev;
4765	mxge_fetch_tunables(sc);
4766
4767	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4768	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4769				  taskqueue_thread_enqueue, &sc->tq);
4770	if (sc->tq == NULL) {
4771		err = ENOMEM;
4772		goto abort_with_nothing;
4773	}
4774
4775	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4776				 1,			/* alignment */
4777				 0,			/* boundary */
4778				 BUS_SPACE_MAXADDR,	/* low */
4779				 BUS_SPACE_MAXADDR,	/* high */
4780				 NULL, NULL,		/* filter */
4781				 65536 + 256,		/* maxsize */
4782				 MXGE_MAX_SEND_DESC, 	/* num segs */
4783				 65536,			/* maxsegsize */
4784				 0,			/* flags */
4785				 NULL, NULL,		/* lock */
4786				 &sc->parent_dmat);	/* tag */
4787
4788	if (err != 0) {
4789		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4790			      err);
4791		goto abort_with_tq;
4792	}
4793
4794	ifp = sc->ifp = if_alloc(IFT_ETHER);
4795	if (ifp == NULL) {
4796		device_printf(dev, "can not if_alloc()\n");
4797		err = ENOSPC;
4798		goto abort_with_parent_dmat;
4799	}
4800	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4801
4802	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4803		 device_get_nameunit(dev));
4804	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4805	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4806		 "%s:drv", device_get_nameunit(dev));
4807	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4808		 MTX_NETWORK_LOCK, MTX_DEF);
4809
4810	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4811
4812	mxge_setup_cfg_space(sc);
4813
4814	/* Map the board into the kernel */
4815	rid = PCIR_BARS;
4816	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4817					 ~0, 1, RF_ACTIVE);
4818	if (sc->mem_res == NULL) {
4819		device_printf(dev, "could not map memory\n");
4820		err = ENXIO;
4821		goto abort_with_lock;
4822	}
4823	sc->sram = rman_get_virtual(sc->mem_res);
4824	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
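	/*
	 * i.e. 2 MB of SRAM less two 48 KB regions, one 32 KB region and
	 * a 256 byte pad: 2097152 - 131072 - 256 = 1965824 usable bytes.
	 */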
4825	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4826		device_printf(dev, "impossible memory region size %ld\n",
4827			      rman_get_size(sc->mem_res));
4828		err = ENXIO;
4829		goto abort_with_mem_res;
4830	}
4831
4832	/* make a NUL-terminated copy of the EEPROM strings section of
4833	   LANai SRAM */
4834	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4835	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4836				rman_get_bushandle(sc->mem_res),
4837				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4838				sc->eeprom_strings,
4839				MXGE_EEPROM_STRINGS_SIZE - 2);
4840	err = mxge_parse_strings(sc);
4841	if (err != 0)
4842		goto abort_with_mem_res;
4843
4844	/* Enable write combining for efficient use of PCIe bus */
4845	mxge_enable_wc(sc);
4846
4847	/* Allocate the out of band dma memory */
4848	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4849			     sizeof (mxge_cmd_t), 64);
4850	if (err != 0)
4851		goto abort_with_mem_res;
4852	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4853	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4854	if (err != 0)
4855		goto abort_with_cmd_dma;
4856
4857	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4858	if (err != 0)
4859		goto abort_with_zeropad_dma;
4860
4861	/* select & load the firmware */
4862	err = mxge_select_firmware(sc);
4863	if (err != 0)
4864		goto abort_with_dmabench;
4865	sc->intr_coal_delay = mxge_intr_coal_delay;
4866
4867	mxge_slice_probe(sc);
4868	err = mxge_alloc_slices(sc);
4869	if (err != 0)
4870		goto abort_with_dmabench;
4871
4872	err = mxge_reset(sc, 0);
4873	if (err != 0)
4874		goto abort_with_slices;
4875
4876	err = mxge_alloc_rings(sc);
4877	if (err != 0) {
4878		device_printf(sc->dev, "failed to allocate rings\n");
4879		goto abort_with_slices;
4880	}
4881
4882	err = mxge_add_irq(sc);
4883	if (err != 0) {
4884		device_printf(sc->dev, "failed to add irq\n");
4885		goto abort_with_rings;
4886	}
4887
4888	ifp->if_baudrate = IF_Gbps(10);
4889	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4890		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4891		IFCAP_RXCSUM_IPV6;
4892#if defined(INET) || defined(INET6)
4893	ifp->if_capabilities |= IFCAP_LRO;
4894#endif
4895
4896#ifdef MXGE_NEW_VLAN_API
4897	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4898
4899	/* Only FW 1.4.32 and newer can do TSO over vlans */
4900	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4901	    sc->fw_ver_tiny >= 32)
4902		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4903#endif
4904	sc->max_mtu = mxge_max_mtu(sc);
4905	if (sc->max_mtu >= 9000)
4906		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4907	else
4908		device_printf(dev, "MTU limited to %d.  Install "
4909			      "latest firmware for 9000 byte jumbo support\n",
4910			      sc->max_mtu - ETHER_HDR_LEN);
4911	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4912	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4913	/* check to see if f/w supports TSO for IPv6 */
4914	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4915		if (CSUM_TCP_IPV6)
4916			ifp->if_capabilities |= IFCAP_TSO6;
4917		sc->max_tso6_hlen = min(cmd.data0,
4918					sizeof (sc->ss[0].scratch));
4919	}
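	/* CSUM_TCP_IPV6 is a nonzero constant here, so TSO6 is advertised
	   whenever the firmware answers the header-size query above */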
4920	ifp->if_capenable = ifp->if_capabilities;
4921	if (sc->lro_cnt == 0)
4922		ifp->if_capenable &= ~IFCAP_LRO;
4923	ifp->if_init = mxge_init;
4924	ifp->if_softc = sc;
4925	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4926	ifp->if_ioctl = mxge_ioctl;
4927	ifp->if_start = mxge_start;
4928	ifp->if_get_counter = mxge_get_counter;
4929	/* Initialise the ifmedia structure */
4930	ifmedia_init(&sc->media, 0, mxge_media_change,
4931		     mxge_media_status);
4932	mxge_media_init(sc);
4933	mxge_media_probe(sc);
4934	sc->dying = 0;
4935	ether_ifattach(ifp, sc->mac_addr);
4936	/* ether_ifattach sets mtu to ETHERMTU */
4937	if (mxge_initial_mtu != ETHERMTU)
4938		mxge_change_mtu(sc, mxge_initial_mtu);
4939
4940	mxge_add_sysctls(sc);
4941#ifdef IFNET_BUF_RING
4942	ifp->if_transmit = mxge_transmit;
4943	ifp->if_qflush = mxge_qflush;
4944#endif
4945	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4946				device_get_nameunit(sc->dev));
4947	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4948	return 0;
4949
4950abort_with_rings:
4951	mxge_free_rings(sc);
4952abort_with_slices:
4953	mxge_free_slices(sc);
4954abort_with_dmabench:
4955	mxge_dma_free(&sc->dmabench_dma);
4956abort_with_zeropad_dma:
4957	mxge_dma_free(&sc->zeropad_dma);
4958abort_with_cmd_dma:
4959	mxge_dma_free(&sc->cmd_dma);
4960abort_with_mem_res:
4961	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4962abort_with_lock:
4963	pci_disable_busmaster(dev);
4964	mtx_destroy(&sc->cmd_mtx);
4965	mtx_destroy(&sc->driver_mtx);
4966	if_free(ifp);
4967abort_with_parent_dmat:
4968	bus_dma_tag_destroy(sc->parent_dmat);
4969abort_with_tq:
4970	if (sc->tq != NULL) {
4971		taskqueue_drain(sc->tq, &sc->watchdog_task);
4972		taskqueue_free(sc->tq);
4973		sc->tq = NULL;
4974	}
4975abort_with_nothing:
4976	return err;
4977}
4978
4979static int
4980mxge_detach(device_t dev)
4981{
4982	mxge_softc_t *sc = device_get_softc(dev);
4983
4984	if (mxge_vlans_active(sc)) {
4985		device_printf(sc->dev,
4986			      "Detach vlans before removing module\n");
4987		return EBUSY;
4988	}
4989	mtx_lock(&sc->driver_mtx);
4990	sc->dying = 1;
4991	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4992		mxge_close(sc, 0);
4993	mtx_unlock(&sc->driver_mtx);
4994	ether_ifdetach(sc->ifp);
4995	if (sc->tq != NULL) {
4996		taskqueue_drain(sc->tq, &sc->watchdog_task);
4997		taskqueue_free(sc->tq);
4998		sc->tq = NULL;
4999	}
5000	callout_drain(&sc->co_hdl);
5001	ifmedia_removeall(&sc->media);
5002	mxge_dummy_rdma(sc, 0);
5003	mxge_rem_sysctls(sc);
5004	mxge_rem_irq(sc);
5005	mxge_free_rings(sc);
5006	mxge_free_slices(sc);
5007	mxge_dma_free(&sc->dmabench_dma);
5008	mxge_dma_free(&sc->zeropad_dma);
5009	mxge_dma_free(&sc->cmd_dma);
5010	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5011	pci_disable_busmaster(dev);
5012	mtx_destroy(&sc->cmd_mtx);
5013	mtx_destroy(&sc->driver_mtx);
5014	if_free(sc->ifp);
5015	bus_dma_tag_destroy(sc->parent_dmat);
5016	return 0;
5017}
5018
5019static int
5020mxge_shutdown(device_t dev)
5021{
5022	return 0;
5023}
5024
5025/*
5026  This file uses Myri10GE driver indentation.
5027
5028  Local Variables:
5029  c-file-style:"linux"
5030  tab-width:8
5031  End:
5032*/
5033