1/******************************************************************************
2
3Copyright (c) 2006-2013, Myricom Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Myricom Inc, nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD$");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/linker.h>
36#include <sys/firmware.h>
37#include <sys/endian.h>
38#include <sys/sockio.h>
39#include <sys/mbuf.h>
40#include <sys/malloc.h>
41#include <sys/kdb.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/module.h>
45#include <sys/socket.h>
46#include <sys/sysctl.h>
47#include <sys/sx.h>
48#include <sys/taskqueue.h>
49
50#include <net/if.h>
51#include <net/if_arp.h>
52#include <net/ethernet.h>
53#include <net/if_dl.h>
54#include <net/if_media.h>
55
56#include <net/bpf.h>
57
58#include <net/if_types.h>
59#include <net/if_vlan_var.h>
60#include <net/zlib.h>
61
62#include <netinet/in_systm.h>
63#include <netinet/in.h>
64#include <netinet/ip.h>
65#include <netinet/ip6.h>
66#include <netinet/tcp.h>
67#include <netinet/tcp_lro.h>
68#include <netinet6/ip6_var.h>
69
70#include <machine/bus.h>
71#include <machine/in_cksum.h>
72#include <machine/resource.h>
73#include <sys/bus.h>
74#include <sys/rman.h>
75#include <sys/smp.h>
76
77#include <dev/pci/pcireg.h>
78#include <dev/pci/pcivar.h>
79#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
80
81#include <vm/vm.h>		/* for pmap_mapdev() */
82#include <vm/pmap.h>
83
84#if defined(__i386) || defined(__amd64)
85#include <machine/specialreg.h>
86#endif
87
88#include <dev/mxge/mxge_mcp.h>
89#include <dev/mxge/mcp_gen_header.h>
90/*#define MXGE_FAKE_IFP*/
91#include <dev/mxge/if_mxge_var.h>
92#ifdef IFNET_BUF_RING
93#include <sys/buf_ring.h>
94#endif
95
96#include "opt_inet.h"
97#include "opt_inet6.h"
98
99/* tunable params */
100static int mxge_nvidia_ecrc_enable = 1;
101static int mxge_force_firmware = 0;
102static int mxge_intr_coal_delay = 30;
103static int mxge_deassert_wait = 1;
104static int mxge_flow_control = 1;
105static int mxge_verbose = 0;
106static int mxge_ticks;
107static int mxge_max_slices = 1;
108static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
109static int mxge_always_promisc = 0;
110static int mxge_initial_mtu = ETHERMTU_JUMBO;
111static int mxge_throttle = 0;
112static char *mxge_fw_unaligned = "mxge_ethp_z8e";
113static char *mxge_fw_aligned = "mxge_eth_z8e";
114static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
115static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
116
117static int mxge_probe(device_t dev);
118static int mxge_attach(device_t dev);
119static int mxge_detach(device_t dev);
120static int mxge_shutdown(device_t dev);
121static void mxge_intr(void *arg);
122
123static device_method_t mxge_methods[] =
124{
125  /* Device interface */
126  DEVMETHOD(device_probe, mxge_probe),
127  DEVMETHOD(device_attach, mxge_attach),
128  DEVMETHOD(device_detach, mxge_detach),
129  DEVMETHOD(device_shutdown, mxge_shutdown),
130  {0, 0}
131};
132
133static driver_t mxge_driver =
134{
135  "mxge",
136  mxge_methods,
137  sizeof(mxge_softc_t),
138};
139
140static devclass_t mxge_devclass;
141
142/* Declare ourselves to be a child of the PCI bus.*/
143DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
144MODULE_DEPEND(mxge, firmware, 1, 1, 1);
145MODULE_DEPEND(mxge, zlib, 1, 1, 1);
146
147static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
148static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
149static int mxge_close(mxge_softc_t *sc, int down);
150static int mxge_open(mxge_softc_t *sc);
151static void mxge_tick(void *arg);
152
153static int
154mxge_probe(device_t dev)
155{
156	int rev;
157
158
159	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
160	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
161	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
162		rev = pci_get_revid(dev);
163		switch (rev) {
164		case MXGE_PCI_REV_Z8E:
165			device_set_desc(dev, "Myri10G-PCIE-8A");
166			break;
167		case MXGE_PCI_REV_Z8ES:
168			device_set_desc(dev, "Myri10G-PCIE-8B");
169			break;
170		default:
171			device_set_desc(dev, "Myri10G-PCIE-8??");
172			device_printf(dev, "Unrecognized rev %d NIC\n",
173				      rev);
174			break;
175		}
176		return 0;
177	}
178	return ENXIO;
179}
180
181static void
182mxge_enable_wc(mxge_softc_t *sc)
183{
184#if defined(__i386) || defined(__amd64)
185	vm_offset_t len;
186	int err;
187
188	sc->wc = 1;
189	len = rman_get_size(sc->mem_res);
190	err = pmap_change_attr((vm_offset_t) sc->sram,
191			       len, PAT_WRITE_COMBINING);
192	if (err != 0) {
193		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
194			      err);
195		sc->wc = 0;
196	}
197#endif
198}
199
200
201/* callback to get our DMA address */
202static void
203mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
204			 int error)
205{
206	if (error == 0) {
207		*(bus_addr_t *) arg = segs->ds_addr;
208	}
209}
210
211static int
212mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
213		   bus_size_t alignment)
214{
215	int err;
216	device_t dev = sc->dev;
217	bus_size_t boundary, maxsegsize;
218
219	if (bytes > 4096 && alignment == 4096) {
220		boundary = 0;
221		maxsegsize = bytes;
222	} else {
223		boundary = 4096;
224		maxsegsize = 4096;
225	}
226
227	/* allocate DMAable memory tags */
228	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
229				 alignment,		/* alignment */
230				 boundary,		/* boundary */
231				 BUS_SPACE_MAXADDR,	/* low */
232				 BUS_SPACE_MAXADDR,	/* high */
233				 NULL, NULL,		/* filter */
234				 bytes,			/* maxsize */
235				 1,			/* num segs */
236				 maxsegsize,		/* maxsegsize */
237				 BUS_DMA_COHERENT,	/* flags */
238				 NULL, NULL,		/* lock */
239				 &dma->dmat);		/* tag */
240	if (err != 0) {
241		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
242		return err;
243	}
244
245	/* allocate DMAable memory & map */
246	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
247			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
248				| BUS_DMA_ZERO),  &dma->map);
249	if (err != 0) {
250		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
251		goto abort_with_dmat;
252	}
253
254	/* load the memory */
255	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
256			      mxge_dmamap_callback,
257			      (void *)&dma->bus_addr, 0);
258	if (err != 0) {
259		device_printf(dev, "couldn't load map (err = %d)\n", err);
260		goto abort_with_mem;
261	}
262	return 0;
263
264abort_with_mem:
265	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
266abort_with_dmat:
267	(void)bus_dma_tag_destroy(dma->dmat);
268	return err;
269}
270
271
272static void
273mxge_dma_free(mxge_dma_t *dma)
274{
275	bus_dmamap_unload(dma->dmat, dma->map);
276	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
277	(void)bus_dma_tag_destroy(dma->dmat);
278}
279
280/*
281 * The eeprom strings on the lanaiX have the format
282 * SN=x\0
283 * MAC=x:x:x:x:x:x\0
284 * PC=text\0
285 */
286
287static int
288mxge_parse_strings(mxge_softc_t *sc)
289{
290	char *ptr;
291	int i, found_mac, found_sn2;
292	char *endptr;
293
294	ptr = sc->eeprom_strings;
295	found_mac = 0;
296	found_sn2 = 0;
297	while (*ptr != '\0') {
298		if (strncmp(ptr, "MAC=", 4) == 0) {
299			ptr += 4;
300			for (i = 0;;) {
301				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
302				if (endptr - ptr != 2)
303					goto abort;
304				ptr = endptr;
305				if (++i == 6)
306					break;
307				if (*ptr++ != ':')
308					goto abort;
309			}
310			found_mac = 1;
311		} else if (strncmp(ptr, "PC=", 3) == 0) {
312			ptr += 3;
313			strlcpy(sc->product_code_string, ptr,
314			    sizeof(sc->product_code_string));
315		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
316			ptr += 3;
317			strlcpy(sc->serial_number_string, ptr,
318			    sizeof(sc->serial_number_string));
319		} else if (strncmp(ptr, "SN2=", 4) == 0) {
320			/* SN2 takes precedence over SN */
321			ptr += 4;
322			found_sn2 = 1;
323			strlcpy(sc->serial_number_string, ptr,
324			    sizeof(sc->serial_number_string));
325		}
326		while (*ptr++ != '\0') {}
327	}
328
329	if (found_mac)
330		return 0;
331
332 abort:
333	device_printf(sc->dev, "failed to parse eeprom_strings\n");
334
335	return ENXIO;
336}
337
338#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
339static void
340mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
341{
342	uint32_t val;
343	unsigned long base, off;
344	char *va, *cfgptr;
345	device_t pdev, mcp55;
346	uint16_t vendor_id, device_id, word;
347	uintptr_t bus, slot, func, ivend, idev;
348	uint32_t *ptr32;
349
350
351	if (!mxge_nvidia_ecrc_enable)
352		return;
353
354	pdev = device_get_parent(device_get_parent(sc->dev));
355	if (pdev == NULL) {
356		device_printf(sc->dev, "could not find parent?\n");
357		return;
358	}
359	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
360	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
361
362	if (vendor_id != 0x10de)
363		return;
364
365	base = 0;
366
367	if (device_id == 0x005d) {
368		/* ck804, base address is magic */
369		base = 0xe0000000UL;
370	} else if (device_id >= 0x0374 && device_id <= 0x378) {
371		/* mcp55, base address stored in chipset */
372		mcp55 = pci_find_bsf(0, 0, 0);
373		if (mcp55 &&
374		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
375		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
376			word = pci_read_config(mcp55, 0x90, 2);
377			base = ((unsigned long)word & 0x7ffeU) << 25;
378		}
379	}
380	if (!base)
381		return;
382
383	/* XXXX
384	   Test below is commented because it is believed that doing
385	   config read/write beyond 0xff will access the config space
386	   for the next larger function.  Uncomment this and remove
387	   the hacky pmap_mapdev() way of accessing config space when
388	   FreeBSD grows support for extended pcie config space access
389	*/
390#if 0
391	/* See if we can, by some miracle, access the extended
392	   config space */
393	val = pci_read_config(pdev, 0x178, 4);
394	if (val != 0xffffffff) {
395		val |= 0x40;
396		pci_write_config(pdev, 0x178, val, 4);
397		return;
398	}
399#endif
400	/* Rather than using normal pci config space writes, we must
401	 * map the Nvidia config space ourselves.  This is because on
402	 * opteron/nvidia class machine the 0xe000000 mapping is
403	 * handled by the nvidia chipset, that means the internal PCI
404	 * device (the on-chip northbridge), or the amd-8131 bridge
405	 * and things behind them are not visible by this method.
406	 */
407
408	BUS_READ_IVAR(device_get_parent(pdev), pdev,
409		      PCI_IVAR_BUS, &bus);
410	BUS_READ_IVAR(device_get_parent(pdev), pdev,
411		      PCI_IVAR_SLOT, &slot);
412	BUS_READ_IVAR(device_get_parent(pdev), pdev,
413		      PCI_IVAR_FUNCTION, &func);
414	BUS_READ_IVAR(device_get_parent(pdev), pdev,
415		      PCI_IVAR_VENDOR, &ivend);
416	BUS_READ_IVAR(device_get_parent(pdev), pdev,
417		      PCI_IVAR_DEVICE, &idev);
418
419	off =  base
420		+ 0x00100000UL * (unsigned long)bus
421		+ 0x00001000UL * (unsigned long)(func
422						 + 8 * slot);
423
424	/* map it into the kernel */
425	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
426
427
428	if (va == NULL) {
429		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
430		return;
431	}
432	/* get a pointer to the config space mapped into the kernel */
433	cfgptr = va + (off & PAGE_MASK);
434
435	/* make sure that we can really access it */
436	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
437	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
438	if (! (vendor_id == ivend && device_id == idev)) {
439		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
440			      vendor_id, device_id);
441		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
442		return;
443	}
444
445	ptr32 = (uint32_t*)(cfgptr + 0x178);
446	val = *ptr32;
447
448	if (val == 0xffffffff) {
449		device_printf(sc->dev, "extended mapping failed\n");
450		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
451		return;
452	}
453	*ptr32 = val | 0x40;
454	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
455	if (mxge_verbose)
456		device_printf(sc->dev,
457			      "Enabled ECRC on upstream Nvidia bridge "
458			      "at %d:%d:%d\n",
459			      (int)bus, (int)slot, (int)func);
460	return;
461}
462#else
463static void
464mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
465{
466	device_printf(sc->dev,
467		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
468	return;
469}
470#endif
471
472
473static int
474mxge_dma_test(mxge_softc_t *sc, int test_type)
475{
476	mxge_cmd_t cmd;
477	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
478	int status;
479	uint32_t len;
480	char *test = " ";
481
482
483	/* Run a small DMA test.
484	 * The magic multipliers to the length tell the firmware
485	 * to do DMA read, write, or read+write tests.  The
486	 * results are returned in cmd.data0.  The upper 16
487	 * bits of the return is the number of transfers completed.
488	 * The lower 16 bits is the time in 0.5us ticks that the
489	 * transfers took to complete.
490	 */
491
492	len = sc->tx_boundary;
493
494	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
495	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
496	cmd.data2 = len * 0x10000;
497	status = mxge_send_cmd(sc, test_type, &cmd);
498	if (status != 0) {
499		test = "read";
500		goto abort;
501	}
502	sc->read_dma = ((cmd.data0>>16) * len * 2) /
503		(cmd.data0 & 0xffff);
504	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
505	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
506	cmd.data2 = len * 0x1;
507	status = mxge_send_cmd(sc, test_type, &cmd);
508	if (status != 0) {
509		test = "write";
510		goto abort;
511	}
512	sc->write_dma = ((cmd.data0>>16) * len * 2) /
513		(cmd.data0 & 0xffff);
514
515	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
516	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
517	cmd.data2 = len * 0x10001;
518	status = mxge_send_cmd(sc, test_type, &cmd);
519	if (status != 0) {
520		test = "read/write";
521		goto abort;
522	}
523	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
524		(cmd.data0 & 0xffff);
525
526abort:
527	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
528		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
529			      test, status);
530
531	return status;
532}
533
534/*
535 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
536 * when the PCI-E Completion packets are aligned on an 8-byte
537 * boundary.  Some PCI-E chip sets always align Completion packets; on
538 * the ones that do not, the alignment can be enforced by enabling
539 * ECRC generation (if supported).
540 *
541 * When PCI-E Completion packets are not aligned, it is actually more
542 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
543 *
544 * If the driver can neither enable ECRC nor verify that it has
545 * already been enabled, then it must use a firmware image which works
546 * around unaligned completion packets (ethp_z8e.dat), and it should
547 * also ensure that it never gives the device a Read-DMA which is
548 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
549 * enabled, then the driver should use the aligned (eth_z8e.dat)
550 * firmware image, and set tx_boundary to 4KB.
551 */
552
553static int
554mxge_firmware_probe(mxge_softc_t *sc)
555{
556	device_t dev = sc->dev;
557	int reg, status;
558	uint16_t pectl;
559
560	sc->tx_boundary = 4096;
561	/*
562	 * Verify the max read request size was set to 4KB
563	 * before trying the test with 4KB.
564	 */
565	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
566		pectl = pci_read_config(dev, reg + 0x8, 2);
567		if ((pectl & (5 << 12)) != (5 << 12)) {
568			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
569				      pectl);
570			sc->tx_boundary = 2048;
571		}
572	}
573
574	/*
575	 * load the optimized firmware (which assumes aligned PCIe
576	 * completions) in order to see if it works on this host.
577	 */
578	sc->fw_name = mxge_fw_aligned;
579	status = mxge_load_firmware(sc, 1);
580	if (status != 0) {
581		return status;
582	}
583
584	/*
585	 * Enable ECRC if possible
586	 */
587	mxge_enable_nvidia_ecrc(sc);
588
589	/*
590	 * Run a DMA test which watches for unaligned completions and
591	 * aborts on the first one seen.  Not required on Z8ES or newer.
592	 */
593	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
594		return 0;
595	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
596	if (status == 0)
597		return 0; /* keep the aligned firmware */
598
599	if (status != E2BIG)
600		device_printf(dev, "DMA test failed: %d\n", status);
601	if (status == ENOSYS)
602		device_printf(dev, "Falling back to ethp! "
603			      "Please install up to date fw\n");
604	return status;
605}
606
607static int
608mxge_select_firmware(mxge_softc_t *sc)
609{
610	int aligned = 0;
611	int force_firmware = mxge_force_firmware;
612
613	if (sc->throttle)
614		force_firmware = sc->throttle;
615
616	if (force_firmware != 0) {
617		if (force_firmware == 1)
618			aligned = 1;
619		else
620			aligned = 0;
621		if (mxge_verbose)
622			device_printf(sc->dev,
623				      "Assuming %s completions (forced)\n",
624				      aligned ? "aligned" : "unaligned");
625		goto abort;
626	}
627
628	/* if the PCIe link width is 4 or less, we can use the aligned
629	   firmware and skip any checks */
630	if (sc->link_width != 0 && sc->link_width <= 4) {
631		device_printf(sc->dev,
632			      "PCIe x%d Link, expect reduced performance\n",
633			      sc->link_width);
634		aligned = 1;
635		goto abort;
636	}
637
638	if (0 == mxge_firmware_probe(sc))
639		return 0;
640
641abort:
642	if (aligned) {
643		sc->fw_name = mxge_fw_aligned;
644		sc->tx_boundary = 4096;
645	} else {
646		sc->fw_name = mxge_fw_unaligned;
647		sc->tx_boundary = 2048;
648	}
649	return (mxge_load_firmware(sc, 0));
650}
651
652static int
653mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
654{
655
656
657	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
658		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
659			      be32toh(hdr->mcp_type));
660		return EIO;
661	}
662
663	/* save firmware version for sysctl */
664	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
665	if (mxge_verbose)
666		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
667
668	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
669	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
670
671	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
672	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
673		device_printf(sc->dev, "Found firmware version %s\n",
674			      sc->fw_version);
675		device_printf(sc->dev, "Driver needs %d.%d\n",
676			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
677		return EINVAL;
678	}
679	return 0;
680
681}
682
683static void *
684z_alloc(void *nil, u_int items, u_int size)
685{
686        void *ptr;
687
688        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
689        return ptr;
690}
691
692static void
693z_free(void *nil, void *ptr)
694{
695        free(ptr, M_TEMP);
696}
697
698
699static int
700mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
701{
702	z_stream zs;
703	char *inflate_buffer;
704	const struct firmware *fw;
705	const mcp_gen_header_t *hdr;
706	unsigned hdr_offset;
707	int status;
708	unsigned int i;
709	char dummy;
710	size_t fw_len;
711
712	fw = firmware_get(sc->fw_name);
713	if (fw == NULL) {
714		device_printf(sc->dev, "Could not find firmware image %s\n",
715			      sc->fw_name);
716		return ENOENT;
717	}
718
719
720
721	/* setup zlib and decompress f/w */
722	bzero(&zs, sizeof (zs));
723	zs.zalloc = z_alloc;
724	zs.zfree = z_free;
725	status = inflateInit(&zs);
726	if (status != Z_OK) {
727		status = EIO;
728		goto abort_with_fw;
729	}
730
731	/* the uncompressed size is stored as the firmware version,
732	   which would otherwise go unused */
733	fw_len = (size_t) fw->version;
734	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
735	if (inflate_buffer == NULL)
736		goto abort_with_zs;
737	zs.avail_in = fw->datasize;
738	zs.next_in = __DECONST(char *, fw->data);
739	zs.avail_out = fw_len;
740	zs.next_out = inflate_buffer;
741	status = inflate(&zs, Z_FINISH);
742	if (status != Z_STREAM_END) {
743		device_printf(sc->dev, "zlib %d\n", status);
744		status = EIO;
745		goto abort_with_buffer;
746	}
747
748	/* check id */
749	hdr_offset = htobe32(*(const uint32_t *)
750			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
751	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
752		device_printf(sc->dev, "Bad firmware file");
753		status = EIO;
754		goto abort_with_buffer;
755	}
756	hdr = (const void*)(inflate_buffer + hdr_offset);
757
758	status = mxge_validate_firmware(sc, hdr);
759	if (status != 0)
760		goto abort_with_buffer;
761
762	/* Copy the inflated firmware to NIC SRAM. */
763	for (i = 0; i < fw_len; i += 256) {
764		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
765			      inflate_buffer + i,
766			      min(256U, (unsigned)(fw_len - i)));
767		wmb();
768		dummy = *sc->sram;
769		wmb();
770	}
771
772	*limit = fw_len;
773	status = 0;
774abort_with_buffer:
775	free(inflate_buffer, M_TEMP);
776abort_with_zs:
777	inflateEnd(&zs);
778abort_with_fw:
779	firmware_put(fw, FIRMWARE_UNLOAD);
780	return status;
781}
782
783/*
784 * Enable or disable periodic RDMAs from the host to make certain
785 * chipsets resend dropped PCIe messages
786 */
787
788static void
789mxge_dummy_rdma(mxge_softc_t *sc, int enable)
790{
791	char buf_bytes[72];
792	volatile uint32_t *confirm;
793	volatile char *submit;
794	uint32_t *buf, dma_low, dma_high;
795	int i;
796
797	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
798
799	/* clear confirmation addr */
800	confirm = (volatile uint32_t *)sc->cmd;
801	*confirm = 0;
802	wmb();
803
804	/* send an rdma command to the PCIe engine, and wait for the
805	   response in the confirmation address.  The firmware should
806	   write a -1 there to indicate it is alive and well
807	*/
808
809	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
810	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
811	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
812	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
813	buf[2] = htobe32(0xffffffff);		/* confirm data */
814	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
815	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
816	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
817	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
818	buf[5] = htobe32(enable);			/* enable? */
819
820
821	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
822
823	mxge_pio_copy(submit, buf, 64);
824	wmb();
825	DELAY(1000);
826	wmb();
827	i = 0;
828	while (*confirm != 0xffffffff && i < 20) {
829		DELAY(1000);
830		i++;
831	}
832	if (*confirm != 0xffffffff) {
833		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
834			      (enable ? "enable" : "disable"), confirm,
835			      *confirm);
836	}
837	return;
838}
839
840static int
841mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
842{
843	mcp_cmd_t *buf;
844	char buf_bytes[sizeof(*buf) + 8];
845	volatile mcp_cmd_response_t *response = sc->cmd;
846	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
847	uint32_t dma_low, dma_high;
848	int err, sleep_total = 0;
849
850	/* ensure buf is aligned to 8 bytes */
851	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
852
853	buf->data0 = htobe32(data->data0);
854	buf->data1 = htobe32(data->data1);
855	buf->data2 = htobe32(data->data2);
856	buf->cmd = htobe32(cmd);
857	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
858	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
859
860	buf->response_addr.low = htobe32(dma_low);
861	buf->response_addr.high = htobe32(dma_high);
862	mtx_lock(&sc->cmd_mtx);
863	response->result = 0xffffffff;
864	wmb();
865	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
866
867	/* wait up to 20ms */
868	err = EAGAIN;
869	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
870		bus_dmamap_sync(sc->cmd_dma.dmat,
871				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
872		wmb();
873		switch (be32toh(response->result)) {
874		case 0:
875			data->data0 = be32toh(response->data);
876			err = 0;
877			break;
878		case 0xffffffff:
879			DELAY(1000);
880			break;
881		case MXGEFW_CMD_UNKNOWN:
882			err = ENOSYS;
883			break;
884		case MXGEFW_CMD_ERROR_UNALIGNED:
885			err = E2BIG;
886			break;
887		case MXGEFW_CMD_ERROR_BUSY:
888			err = EBUSY;
889			break;
890		case MXGEFW_CMD_ERROR_I2C_ABSENT:
891			err = ENXIO;
892			break;
893		default:
894			device_printf(sc->dev,
895				      "mxge: command %d "
896				      "failed, result = %d\n",
897				      cmd, be32toh(response->result));
898			err = ENXIO;
899			break;
900		}
901		if (err != EAGAIN)
902			break;
903	}
904	if (err == EAGAIN)
905		device_printf(sc->dev, "mxge: command %d timed out"
906			      "result = %d\n",
907			      cmd, be32toh(response->result));
908	mtx_unlock(&sc->cmd_mtx);
909	return err;
910}
911
912static int
913mxge_adopt_running_firmware(mxge_softc_t *sc)
914{
915	struct mcp_gen_header *hdr;
916	const size_t bytes = sizeof (struct mcp_gen_header);
917	size_t hdr_offset;
918	int status;
919
920	/* find running firmware header */
921	hdr_offset = htobe32(*(volatile uint32_t *)
922			     (sc->sram + MCP_HEADER_PTR_OFFSET));
923
924	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
925		device_printf(sc->dev,
926			      "Running firmware has bad header offset (%d)\n",
927			      (int)hdr_offset);
928		return EIO;
929	}
930
931	/* copy header of running firmware from SRAM to host memory to
932	 * validate firmware */
933	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
934	if (hdr == NULL) {
935		device_printf(sc->dev, "could not malloc firmware hdr\n");
936		return ENOMEM;
937	}
938	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
939				rman_get_bushandle(sc->mem_res),
940				hdr_offset, (char *)hdr, bytes);
941	status = mxge_validate_firmware(sc, hdr);
942	free(hdr, M_DEVBUF);
943
944	/*
945	 * check to see if adopted firmware has bug where adopting
946	 * it will cause broadcasts to be filtered unless the NIC
947	 * is kept in ALLMULTI mode
948	 */
949	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
950	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
951		sc->adopted_rx_filter_bug = 1;
952		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
953			      "working around rx filter bug\n",
954			      sc->fw_ver_major, sc->fw_ver_minor,
955			      sc->fw_ver_tiny);
956	}
957
958	return status;
959}
960
961
962static int
963mxge_load_firmware(mxge_softc_t *sc, int adopt)
964{
965	volatile uint32_t *confirm;
966	volatile char *submit;
967	char buf_bytes[72];
968	uint32_t *buf, size, dma_low, dma_high;
969	int status, i;
970
971	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
972
973	size = sc->sram_size;
974	status = mxge_load_firmware_helper(sc, &size);
975	if (status) {
976		if (!adopt)
977			return status;
978		/* Try to use the currently running firmware, if
979		   it is new enough */
980		status = mxge_adopt_running_firmware(sc);
981		if (status) {
982			device_printf(sc->dev,
983				      "failed to adopt running firmware\n");
984			return status;
985		}
986		device_printf(sc->dev,
987			      "Successfully adopted running firmware\n");
988		if (sc->tx_boundary == 4096) {
989			device_printf(sc->dev,
990				"Using firmware currently running on NIC"
991				 ".  For optimal\n");
992			device_printf(sc->dev,
993				 "performance consider loading optimized "
994				 "firmware\n");
995		}
996		sc->fw_name = mxge_fw_unaligned;
997		sc->tx_boundary = 2048;
998		return 0;
999	}
1000	/* clear confirmation addr */
1001	confirm = (volatile uint32_t *)sc->cmd;
1002	*confirm = 0;
1003	wmb();
1004	/* send a reload command to the bootstrap MCP, and wait for the
1005	   response in the confirmation address.  The firmware should
1006	   write a -1 there to indicate it is alive and well
1007	*/
1008
1009	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1010	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1011
1012	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1013	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1014	buf[2] = htobe32(0xffffffff);	/* confirm data */
1015
1016	/* FIX: All newest firmware should un-protect the bottom of
1017	   the sram before handoff. However, the very first interfaces
1018	   do not. Therefore the handoff copy must skip the first 8 bytes
1019	*/
1020					/* where the code starts*/
1021	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1022	buf[4] = htobe32(size - 8); 	/* length of code */
1023	buf[5] = htobe32(8);		/* where to copy to */
1024	buf[6] = htobe32(0);		/* where to jump to */
1025
1026	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1027	mxge_pio_copy(submit, buf, 64);
1028	wmb();
1029	DELAY(1000);
1030	wmb();
1031	i = 0;
1032	while (*confirm != 0xffffffff && i < 20) {
1033		DELAY(1000*10);
1034		i++;
1035		bus_dmamap_sync(sc->cmd_dma.dmat,
1036				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1037	}
1038	if (*confirm != 0xffffffff) {
1039		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1040			confirm, *confirm);
1041
1042		return ENXIO;
1043	}
1044	return 0;
1045}
1046
1047static int
1048mxge_update_mac_address(mxge_softc_t *sc)
1049{
1050	mxge_cmd_t cmd;
1051	uint8_t *addr = sc->mac_addr;
1052	int status;
1053
1054
1055	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1056		     | (addr[2] << 8) | addr[3]);
1057
1058	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1059
1060	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1061	return status;
1062}
1063
1064static int
1065mxge_change_pause(mxge_softc_t *sc, int pause)
1066{
1067	mxge_cmd_t cmd;
1068	int status;
1069
1070	if (pause)
1071		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1072				       &cmd);
1073	else
1074		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1075				       &cmd);
1076
1077	if (status) {
1078		device_printf(sc->dev, "Failed to set flow control mode\n");
1079		return ENXIO;
1080	}
1081	sc->pause = pause;
1082	return 0;
1083}
1084
1085static void
1086mxge_change_promisc(mxge_softc_t *sc, int promisc)
1087{
1088	mxge_cmd_t cmd;
1089	int status;
1090
1091	if (mxge_always_promisc)
1092		promisc = 1;
1093
1094	if (promisc)
1095		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1096				       &cmd);
1097	else
1098		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1099				       &cmd);
1100
1101	if (status) {
1102		device_printf(sc->dev, "Failed to set promisc mode\n");
1103	}
1104}
1105
1106static void
1107mxge_set_multicast_list(mxge_softc_t *sc)
1108{
1109	mxge_cmd_t cmd;
1110	struct ifmultiaddr *ifma;
1111	struct ifnet *ifp = sc->ifp;
1112	int err;
1113
1114	/* This firmware is known to not support multicast */
1115	if (!sc->fw_multicast_support)
1116		return;
1117
1118	/* Disable multicast filtering while we play with the lists*/
1119	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1120	if (err != 0) {
1121		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1122		       " error status: %d\n", err);
1123		return;
1124	}
1125
1126	if (sc->adopted_rx_filter_bug)
1127		return;
1128
1129	if (ifp->if_flags & IFF_ALLMULTI)
1130		/* request to disable multicast filtering, so quit here */
1131		return;
1132
1133	/* Flush all the filters */
1134
1135	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1136	if (err != 0) {
1137		device_printf(sc->dev,
1138			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1139			      ", error status: %d\n", err);
1140		return;
1141	}
1142
1143	/* Walk the multicast list, and add each address */
1144
1145	if_maddr_rlock(ifp);
1146	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1147		if (ifma->ifma_addr->sa_family != AF_LINK)
1148			continue;
1149		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1150		      &cmd.data0, 4);
1151		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1152		      &cmd.data1, 2);
1153		cmd.data0 = htonl(cmd.data0);
1154		cmd.data1 = htonl(cmd.data1);
1155		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1156		if (err != 0) {
1157			device_printf(sc->dev, "Failed "
1158			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1159			       "%d\t", err);
1160			/* abort, leaving multicast filtering off */
1161			if_maddr_runlock(ifp);
1162			return;
1163		}
1164	}
1165	if_maddr_runlock(ifp);
1166	/* Enable multicast filtering */
1167	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1168	if (err != 0) {
1169		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1170		       ", error status: %d\n", err);
1171	}
1172}
1173
1174static int
1175mxge_max_mtu(mxge_softc_t *sc)
1176{
1177	mxge_cmd_t cmd;
1178	int status;
1179
1180	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1181		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1182
1183	/* try to set nbufs to see if it we can
1184	   use virtually contiguous jumbos */
1185	cmd.data0 = 0;
1186	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1187			       &cmd);
1188	if (status == 0)
1189		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1190
1191	/* otherwise, we're limited to MJUMPAGESIZE */
1192	return MJUMPAGESIZE - MXGEFW_PAD;
1193}
1194
1195static int
1196mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1197{
1198	struct mxge_slice_state *ss;
1199	mxge_rx_done_t *rx_done;
1200	volatile uint32_t *irq_claim;
1201	mxge_cmd_t cmd;
1202	int slice, status;
1203
1204	/* try to send a reset command to the card to see if it
1205	   is alive */
1206	memset(&cmd, 0, sizeof (cmd));
1207	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1208	if (status != 0) {
1209		device_printf(sc->dev, "failed reset\n");
1210		return ENXIO;
1211	}
1212
1213	mxge_dummy_rdma(sc, 1);
1214
1215
1216	/* set the intrq size */
1217	cmd.data0 = sc->rx_ring_size;
1218	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1219
1220	/*
1221	 * Even though we already know how many slices are supported
1222	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1223	 * has magic side effects, and must be called after a reset.
1224	 * It must be called prior to calling any RSS related cmds,
1225	 * including assigning an interrupt queue for anything but
1226	 * slice 0.  It must also be called *after*
1227	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1228	 * the firmware to compute offsets.
1229	 */
1230
1231	if (sc->num_slices > 1) {
1232		/* ask the maximum number of slices it supports */
1233		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1234					   &cmd);
1235		if (status != 0) {
1236			device_printf(sc->dev,
1237				      "failed to get number of slices\n");
1238			return status;
1239		}
1240		/*
1241		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1242		 * to setting up the interrupt queue DMA
1243		 */
1244		cmd.data0 = sc->num_slices;
1245		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1246#ifdef IFNET_BUF_RING
1247		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1248#endif
1249		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1250					   &cmd);
1251		if (status != 0) {
1252			device_printf(sc->dev,
1253				      "failed to set number of slices\n");
1254			return status;
1255		}
1256	}
1257
1258
1259	if (interrupts_setup) {
1260		/* Now exchange information about interrupts  */
1261		for (slice = 0; slice < sc->num_slices; slice++) {
1262			rx_done = &sc->ss[slice].rx_done;
1263			memset(rx_done->entry, 0, sc->rx_ring_size);
1264			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1265			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1266			cmd.data2 = slice;
1267			status |= mxge_send_cmd(sc,
1268						MXGEFW_CMD_SET_INTRQ_DMA,
1269						&cmd);
1270		}
1271	}
1272
1273	status |= mxge_send_cmd(sc,
1274				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1275
1276
1277	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1278
1279	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1280	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1281
1282
1283	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1284				&cmd);
1285	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1286	if (status != 0) {
1287		device_printf(sc->dev, "failed set interrupt parameters\n");
1288		return status;
1289	}
1290
1291
1292	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1293
1294
1295	/* run a DMA benchmark */
1296	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1297
1298	for (slice = 0; slice < sc->num_slices; slice++) {
1299		ss = &sc->ss[slice];
1300
1301		ss->irq_claim = irq_claim + (2 * slice);
1302		/* reset mcp/driver shared state back to 0 */
1303		ss->rx_done.idx = 0;
1304		ss->rx_done.cnt = 0;
1305		ss->tx.req = 0;
1306		ss->tx.done = 0;
1307		ss->tx.pkt_done = 0;
1308		ss->tx.queue_active = 0;
1309		ss->tx.activate = 0;
1310		ss->tx.deactivate = 0;
1311		ss->tx.wake = 0;
1312		ss->tx.defrag = 0;
1313		ss->tx.stall = 0;
1314		ss->rx_big.cnt = 0;
1315		ss->rx_small.cnt = 0;
1316		ss->lc.lro_bad_csum = 0;
1317		ss->lc.lro_queued = 0;
1318		ss->lc.lro_flushed = 0;
1319		if (ss->fw_stats != NULL) {
1320			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1321		}
1322	}
1323	sc->rdma_tags_available = 15;
1324	status = mxge_update_mac_address(sc);
1325	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1326	mxge_change_pause(sc, sc->pause);
1327	mxge_set_multicast_list(sc);
1328	if (sc->throttle) {
1329		cmd.data0 = sc->throttle;
1330		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1331				  &cmd)) {
1332			device_printf(sc->dev,
1333				      "can't enable throttle\n");
1334		}
1335	}
1336	return status;
1337}
1338
1339static int
1340mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1341{
1342	mxge_cmd_t cmd;
1343	mxge_softc_t *sc;
1344	int err;
1345	unsigned int throttle;
1346
1347	sc = arg1;
1348	throttle = sc->throttle;
1349	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1350        if (err != 0) {
1351                return err;
1352        }
1353
1354	if (throttle == sc->throttle)
1355		return 0;
1356
1357        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1358                return EINVAL;
1359
1360	mtx_lock(&sc->driver_mtx);
1361	cmd.data0 = throttle;
1362	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1363	if (err == 0)
1364		sc->throttle = throttle;
1365	mtx_unlock(&sc->driver_mtx);
1366	return err;
1367}
1368
1369static int
1370mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1371{
1372        mxge_softc_t *sc;
1373        unsigned int intr_coal_delay;
1374        int err;
1375
1376        sc = arg1;
1377        intr_coal_delay = sc->intr_coal_delay;
1378        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1379        if (err != 0) {
1380                return err;
1381        }
1382        if (intr_coal_delay == sc->intr_coal_delay)
1383                return 0;
1384
1385        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1386                return EINVAL;
1387
1388	mtx_lock(&sc->driver_mtx);
1389	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1390	sc->intr_coal_delay = intr_coal_delay;
1391
1392	mtx_unlock(&sc->driver_mtx);
1393        return err;
1394}
1395
1396static int
1397mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1398{
1399        mxge_softc_t *sc;
1400        unsigned int enabled;
1401        int err;
1402
1403        sc = arg1;
1404        enabled = sc->pause;
1405        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1406        if (err != 0) {
1407                return err;
1408        }
1409        if (enabled == sc->pause)
1410                return 0;
1411
1412	mtx_lock(&sc->driver_mtx);
1413	err = mxge_change_pause(sc, enabled);
1414	mtx_unlock(&sc->driver_mtx);
1415        return err;
1416}
1417
1418static int
1419mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1420{
1421        int err;
1422
1423        if (arg1 == NULL)
1424                return EFAULT;
1425        arg2 = be32toh(*(int *)arg1);
1426        arg1 = NULL;
1427        err = sysctl_handle_int(oidp, arg1, arg2, req);
1428
1429        return err;
1430}
1431
1432static void
1433mxge_rem_sysctls(mxge_softc_t *sc)
1434{
1435	struct mxge_slice_state *ss;
1436	int slice;
1437
1438	if (sc->slice_sysctl_tree == NULL)
1439		return;
1440
1441	for (slice = 0; slice < sc->num_slices; slice++) {
1442		ss = &sc->ss[slice];
1443		if (ss == NULL || ss->sysctl_tree == NULL)
1444			continue;
1445		sysctl_ctx_free(&ss->sysctl_ctx);
1446		ss->sysctl_tree = NULL;
1447	}
1448	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1449	sc->slice_sysctl_tree = NULL;
1450}
1451
1452static void
1453mxge_add_sysctls(mxge_softc_t *sc)
1454{
1455	struct sysctl_ctx_list *ctx;
1456	struct sysctl_oid_list *children;
1457	mcp_irq_data_t *fw;
1458	struct mxge_slice_state *ss;
1459	int slice;
1460	char slice_num[8];
1461
1462	ctx = device_get_sysctl_ctx(sc->dev);
1463	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1464	fw = sc->ss[0].fw_stats;
1465
1466	/* random information */
1467	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1468		       "firmware_version",
1469		       CTLFLAG_RD, &sc->fw_version,
1470		       0, "firmware version");
1471	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1472		       "serial_number",
1473		       CTLFLAG_RD, &sc->serial_number_string,
1474		       0, "serial number");
1475	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1476		       "product_code",
1477		       CTLFLAG_RD, &sc->product_code_string,
1478		       0, "product_code");
1479	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1480		       "pcie_link_width",
1481		       CTLFLAG_RD, &sc->link_width,
1482		       0, "tx_boundary");
1483	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1484		       "tx_boundary",
1485		       CTLFLAG_RD, &sc->tx_boundary,
1486		       0, "tx_boundary");
1487	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1488		       "write_combine",
1489		       CTLFLAG_RD, &sc->wc,
1490		       0, "write combining PIO?");
1491	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1492		       "read_dma_MBs",
1493		       CTLFLAG_RD, &sc->read_dma,
1494		       0, "DMA Read speed in MB/s");
1495	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1496		       "write_dma_MBs",
1497		       CTLFLAG_RD, &sc->write_dma,
1498		       0, "DMA Write speed in MB/s");
1499	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1500		       "read_write_dma_MBs",
1501		       CTLFLAG_RD, &sc->read_write_dma,
1502		       0, "DMA concurrent Read/Write speed in MB/s");
1503	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1504		       "watchdog_resets",
1505		       CTLFLAG_RD, &sc->watchdog_resets,
1506		       0, "Number of times NIC was reset");
1507
1508
1509	/* performance related tunables */
1510	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1511			"intr_coal_delay",
1512			CTLTYPE_INT|CTLFLAG_RW, sc,
1513			0, mxge_change_intr_coal,
1514			"I", "interrupt coalescing delay in usecs");
1515
1516	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1517			"throttle",
1518			CTLTYPE_INT|CTLFLAG_RW, sc,
1519			0, mxge_change_throttle,
1520			"I", "transmit throttling");
1521
1522	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1523			"flow_control_enabled",
1524			CTLTYPE_INT|CTLFLAG_RW, sc,
1525			0, mxge_change_flow_control,
1526			"I", "interrupt coalescing delay in usecs");
1527
1528	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1529		       "deassert_wait",
1530		       CTLFLAG_RW, &mxge_deassert_wait,
1531		       0, "Wait for IRQ line to go low in ihandler");
1532
1533	/* stats block from firmware is in network byte order.
1534	   Need to swap it */
1535	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1536			"link_up",
1537			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1538			0, mxge_handle_be32,
1539			"I", "link up");
1540	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1541			"rdma_tags_available",
1542			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1543			0, mxge_handle_be32,
1544			"I", "rdma_tags_available");
1545	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546			"dropped_bad_crc32",
1547			CTLTYPE_INT|CTLFLAG_RD,
1548			&fw->dropped_bad_crc32,
1549			0, mxge_handle_be32,
1550			"I", "dropped_bad_crc32");
1551	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1552			"dropped_bad_phy",
1553			CTLTYPE_INT|CTLFLAG_RD,
1554			&fw->dropped_bad_phy,
1555			0, mxge_handle_be32,
1556			"I", "dropped_bad_phy");
1557	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1558			"dropped_link_error_or_filtered",
1559			CTLTYPE_INT|CTLFLAG_RD,
1560			&fw->dropped_link_error_or_filtered,
1561			0, mxge_handle_be32,
1562			"I", "dropped_link_error_or_filtered");
1563	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1564			"dropped_link_overflow",
1565			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1566			0, mxge_handle_be32,
1567			"I", "dropped_link_overflow");
1568	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1569			"dropped_multicast_filtered",
1570			CTLTYPE_INT|CTLFLAG_RD,
1571			&fw->dropped_multicast_filtered,
1572			0, mxge_handle_be32,
1573			"I", "dropped_multicast_filtered");
1574	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1575			"dropped_no_big_buffer",
1576			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1577			0, mxge_handle_be32,
1578			"I", "dropped_no_big_buffer");
1579	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1580			"dropped_no_small_buffer",
1581			CTLTYPE_INT|CTLFLAG_RD,
1582			&fw->dropped_no_small_buffer,
1583			0, mxge_handle_be32,
1584			"I", "dropped_no_small_buffer");
1585	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1586			"dropped_overrun",
1587			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1588			0, mxge_handle_be32,
1589			"I", "dropped_overrun");
1590	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1591			"dropped_pause",
1592			CTLTYPE_INT|CTLFLAG_RD,
1593			&fw->dropped_pause,
1594			0, mxge_handle_be32,
1595			"I", "dropped_pause");
1596	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1597			"dropped_runt",
1598			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1599			0, mxge_handle_be32,
1600			"I", "dropped_runt");
1601
1602	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1603			"dropped_unicast_filtered",
1604			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1605			0, mxge_handle_be32,
1606			"I", "dropped_unicast_filtered");
1607
1608	/* verbose printing? */
1609	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1610		       "verbose",
1611		       CTLFLAG_RW, &mxge_verbose,
1612		       0, "verbose printing");
1613
1614	/* add counters exported for debugging from all slices */
1615	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1616	sc->slice_sysctl_tree =
1617		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1618				"slice", CTLFLAG_RD, 0, "");
1619
1620	for (slice = 0; slice < sc->num_slices; slice++) {
1621		ss = &sc->ss[slice];
1622		sysctl_ctx_init(&ss->sysctl_ctx);
1623		ctx = &ss->sysctl_ctx;
1624		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1625		sprintf(slice_num, "%d", slice);
1626		ss->sysctl_tree =
1627			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1628					CTLFLAG_RD, 0, "");
1629		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1630		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1631			       "rx_small_cnt",
1632			       CTLFLAG_RD, &ss->rx_small.cnt,
1633			       0, "rx_small_cnt");
1634		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635			       "rx_big_cnt",
1636			       CTLFLAG_RD, &ss->rx_big.cnt,
1637			       0, "rx_small_cnt");
1638		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1639			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1640			       0, "number of lro merge queues flushed");
1641
1642		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1643			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1644			       0, "number of bad csums preventing LRO");
1645
1646		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1647			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1648			       0, "number of frames appended to lro merge"
1649			       "queues");
1650
1651#ifndef IFNET_BUF_RING
1652		/* only transmit from slice 0 for now */
1653		if (slice > 0)
1654			continue;
1655#endif
1656		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1657			       "tx_req",
1658			       CTLFLAG_RD, &ss->tx.req,
1659			       0, "tx_req");
1660
1661		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1662			       "tx_done",
1663			       CTLFLAG_RD, &ss->tx.done,
1664			       0, "tx_done");
1665		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1666			       "tx_pkt_done",
1667			       CTLFLAG_RD, &ss->tx.pkt_done,
1668			       0, "tx_done");
1669		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1670			       "tx_stall",
1671			       CTLFLAG_RD, &ss->tx.stall,
1672			       0, "tx_stall");
1673		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1674			       "tx_wake",
1675			       CTLFLAG_RD, &ss->tx.wake,
1676			       0, "tx_wake");
1677		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1678			       "tx_defrag",
1679			       CTLFLAG_RD, &ss->tx.defrag,
1680			       0, "tx_defrag");
1681		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1682			       "tx_queue_active",
1683			       CTLFLAG_RD, &ss->tx.queue_active,
1684			       0, "tx_queue_active");
1685		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1686			       "tx_activate",
1687			       CTLFLAG_RD, &ss->tx.activate,
1688			       0, "tx_activate");
1689		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1690			       "tx_deactivate",
1691			       CTLFLAG_RD, &ss->tx.deactivate,
1692			       0, "tx_deactivate");
1693	}
1694}
1695
1696/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1697   backwards one at a time and handle ring wraps */
1698
1699static inline void
1700mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1701			    mcp_kreq_ether_send_t *src, int cnt)
1702{
1703        int idx, starting_slot;
1704        starting_slot = tx->req;
1705        while (cnt > 1) {
1706                cnt--;
1707                idx = (starting_slot + cnt) & tx->mask;
1708                mxge_pio_copy(&tx->lanai[idx],
1709			      &src[cnt], sizeof(*src));
1710                wmb();
1711        }
1712}
1713
1714/*
1715 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1716 * at most 32 bytes at a time, so as to avoid involving the software
1717 * pio handler in the nic.   We re-write the first segment's flags
1718 * to mark them valid only after writing the entire chain
1719 */
1720
1721static inline void
1722mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1723                  int cnt)
1724{
1725        int idx, i;
1726        uint32_t *src_ints;
1727	volatile uint32_t *dst_ints;
1728        mcp_kreq_ether_send_t *srcp;
1729	volatile mcp_kreq_ether_send_t *dstp, *dst;
1730	uint8_t last_flags;
1731
1732        idx = tx->req & tx->mask;
1733
1734	last_flags = src->flags;
1735	src->flags = 0;
1736        wmb();
1737        dst = dstp = &tx->lanai[idx];
1738        srcp = src;
1739
1740        if ((idx + cnt) < tx->mask) {
1741                for (i = 0; i < (cnt - 1); i += 2) {
1742                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1743                        wmb(); /* force write every 32 bytes */
1744                        srcp += 2;
1745                        dstp += 2;
1746                }
1747        } else {
1748                /* submit all but the first request, and ensure
1749                   that it is submitted below */
1750                mxge_submit_req_backwards(tx, src, cnt);
1751                i = 0;
1752        }
1753        if (i < cnt) {
1754                /* submit the first request */
1755                mxge_pio_copy(dstp, srcp, sizeof(*src));
1756                wmb(); /* barrier before setting valid flag */
1757        }
1758
1759        /* re-write the last 32-bits with the valid flags */
1760        src->flags = last_flags;
1761        src_ints = (uint32_t *)src;
1762        src_ints+=3;
1763        dst_ints = (volatile uint32_t *)dst;
1764        dst_ints+=3;
1765        *dst_ints =  *src_ints;
1766        tx->req += cnt;
1767        wmb();
1768}
1769
1770static int
1771mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1772    struct mxge_pkt_info *pi)
1773{
1774	struct ether_vlan_header *eh;
1775	uint16_t etype;
1776	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1777#if IFCAP_TSO6 && defined(INET6)
1778	int nxt;
1779#endif
1780
1781	eh = mtod(m, struct ether_vlan_header *);
1782	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1783		etype = ntohs(eh->evl_proto);
1784		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1785	} else {
1786		etype = ntohs(eh->evl_encap_proto);
1787		pi->ip_off = ETHER_HDR_LEN;
1788	}
1789
1790	switch (etype) {
1791	case ETHERTYPE_IP:
1792		/*
1793		 * ensure ip header is in first mbuf, copy it to a
1794		 * scratch buffer if not
1795		 */
1796		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1797		pi->ip6 = NULL;
1798		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1799			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1800			    ss->scratch);
1801			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1802		}
1803		pi->ip_hlen = pi->ip->ip_hl << 2;
1804		if (!tso)
1805			return 0;
1806
1807		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1808		    sizeof(struct tcphdr))) {
1809			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1810			    sizeof(struct tcphdr), ss->scratch);
1811			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1812		}
1813		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1814		break;
1815#if IFCAP_TSO6 && defined(INET6)
1816	case ETHERTYPE_IPV6:
1817		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1818		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1819			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1820			    ss->scratch);
1821			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1822		}
1823		nxt = 0;
1824		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1825		pi->ip_hlen -= pi->ip_off;
1826		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1827			return EINVAL;
1828
1829		if (!tso)
1830			return 0;
1831
1832		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1833			return EINVAL;
1834
1835		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1836		    sizeof(struct tcphdr))) {
1837			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1838			    sizeof(struct tcphdr), ss->scratch);
1839			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1840		}
1841		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1842		break;
1843#endif
1844	default:
1845		return EINVAL;
1846	}
1847	return 0;
1848}
1849
1850#if IFCAP_TSO4
1851
1852static void
1853mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1854	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1855{
1856	mxge_tx_ring_t *tx;
1857	mcp_kreq_ether_send_t *req;
1858	bus_dma_segment_t *seg;
1859	uint32_t low, high_swapped;
1860	int len, seglen, cum_len, cum_len_next;
1861	int next_is_first, chop, cnt, rdma_count, small;
1862	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1863	uint8_t flags, flags_next;
1864	static int once;
1865
1866	mss = m->m_pkthdr.tso_segsz;
1867
1868	/* negative cum_len signifies to the
1869	 * send loop that we are still in the
1870	 * header portion of the TSO packet.
1871	 */
1872
1873	cksum_offset = pi->ip_off + pi->ip_hlen;
1874	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1875
1876	/* TSO implies checksum offload on this hardware */
1877	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1878		/*
1879		 * If packet has full TCP csum, replace it with pseudo hdr
1880		 * sum that the NIC expects, otherwise the NIC will emit
1881		 * packets with bad TCP checksums.
1882		 */
1883		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1884		if (pi->ip6) {
1885#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1886			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1887			sum = in6_cksum_pseudo(pi->ip6,
1888			    m->m_pkthdr.len - cksum_offset,
1889			    IPPROTO_TCP, 0);
1890#endif
1891		} else {
1892#ifdef INET
1893			m->m_pkthdr.csum_flags |= CSUM_TCP;
1894			sum = in_pseudo(pi->ip->ip_src.s_addr,
1895			    pi->ip->ip_dst.s_addr,
1896			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1897				    cksum_offset)));
1898#endif
1899		}
1900		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1901		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1902	}
1903	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1904
1905
1906	/* for TSO, pseudo_hdr_offset holds mss.
1907	 * The firmware figures out where to put
1908	 * the checksum by parsing the header. */
1909	pseudo_hdr_offset = htobe16(mss);
1910
1911	if (pi->ip6) {
1912		/*
1913		 * for IPv6 TSO, the "checksum offset" is re-purposed
1914		 * to store the TCP header len
1915		 */
1916		cksum_offset = (pi->tcp->th_off << 2);
1917	}
1918
1919	tx = &ss->tx;
1920	req = tx->req_list;
1921	seg = tx->seg_list;
1922	cnt = 0;
1923	rdma_count = 0;
1924	/* "rdma_count" is the number of RDMAs belonging to the
1925	 * current packet BEFORE the current send request. For
1926	 * non-TSO packets, this is equal to "count".
1927	 * For TSO packets, rdma_count needs to be reset
1928	 * to 0 after a segment cut.
1929	 *
1930	 * The rdma_count field of the send request is
1931	 * the number of RDMAs of the packet starting at
1932	 * that request. For TSO send requests with one ore more cuts
1933	 * in the middle, this is the number of RDMAs starting
1934	 * after the last cut in the request. All previous
1935	 * segments before the last cut implicitly have 1 RDMA.
1936	 *
1937	 * Since the number of RDMAs is not known beforehand,
1938	 * it must be filled-in retroactively - after each
1939	 * segmentation cut or at the end of the entire packet.
1940	 */
1941
1942	while (busdma_seg_cnt) {
1943		/* Break the busdma segment up into pieces*/
1944		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1945		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1946		len = seg->ds_len;
1947
1948		while (len) {
1949			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1950			seglen = len;
1951			cum_len_next = cum_len + seglen;
1952			(req-rdma_count)->rdma_count = rdma_count + 1;
1953			if (__predict_true(cum_len >= 0)) {
1954				/* payload */
1955				chop = (cum_len_next > mss);
1956				cum_len_next = cum_len_next % mss;
1957				next_is_first = (cum_len_next == 0);
1958				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1959				flags_next |= next_is_first *
1960					MXGEFW_FLAGS_FIRST;
1961				rdma_count |= -(chop | next_is_first);
1962				rdma_count += chop & !next_is_first;
1963			} else if (cum_len_next >= 0) {
1964				/* header ends */
1965				rdma_count = -1;
1966				cum_len_next = 0;
1967				seglen = -cum_len;
1968				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1969				flags_next = MXGEFW_FLAGS_TSO_PLD |
1970					MXGEFW_FLAGS_FIRST |
1971					(small * MXGEFW_FLAGS_SMALL);
1972			    }
1973
1974			req->addr_high = high_swapped;
1975			req->addr_low = htobe32(low);
1976			req->pseudo_hdr_offset = pseudo_hdr_offset;
1977			req->pad = 0;
1978			req->rdma_count = 1;
1979			req->length = htobe16(seglen);
1980			req->cksum_offset = cksum_offset;
1981			req->flags = flags | ((cum_len & 1) *
1982					      MXGEFW_FLAGS_ALIGN_ODD);
1983			low += seglen;
1984			len -= seglen;
1985			cum_len = cum_len_next;
1986			flags = flags_next;
1987			req++;
1988			cnt++;
1989			rdma_count++;
1990			if (cksum_offset != 0 && !pi->ip6) {
1991				if (__predict_false(cksum_offset > seglen))
1992					cksum_offset -= seglen;
1993				else
1994					cksum_offset = 0;
1995			}
1996			if (__predict_false(cnt > tx->max_desc))
1997				goto drop;
1998		}
1999		busdma_seg_cnt--;
2000		seg++;
2001	}
2002	(req-rdma_count)->rdma_count = rdma_count;
2003
2004	do {
2005		req--;
2006		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2007	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2008
2009	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2010	mxge_submit_req(tx, tx->req_list, cnt);
2011#ifdef IFNET_BUF_RING
2012	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2013		/* tell the NIC to start polling this slice */
2014		*tx->send_go = 1;
2015		tx->queue_active = 1;
2016		tx->activate++;
2017		wmb();
2018	}
2019#endif
2020	return;
2021
2022drop:
2023	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2024	m_freem(m);
2025	ss->oerrors++;
2026	if (!once) {
2027		printf("tx->max_desc exceeded via TSO!\n");
2028		printf("mss = %d, %ld, %d!\n", mss,
2029		       (long)seg - (long)tx->seg_list, tx->max_desc);
2030		once = 1;
2031	}
2032	return;
2033
2034}
2035
2036#endif /* IFCAP_TSO4 */
2037
2038#ifdef MXGE_NEW_VLAN_API
2039/*
2040 * We reproduce the software vlan tag insertion from
2041 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2042 * vlan tag insertion. We need to advertise this in order to have the
2043 * vlan interface respect our csum offload flags.
2044 */
2045static struct mbuf *
2046mxge_vlan_tag_insert(struct mbuf *m)
2047{
2048	struct ether_vlan_header *evl;
2049
2050	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2051	if (__predict_false(m == NULL))
2052		return NULL;
2053	if (m->m_len < sizeof(*evl)) {
2054		m = m_pullup(m, sizeof(*evl));
2055		if (__predict_false(m == NULL))
2056			return NULL;
2057	}
2058	/*
2059	 * Transform the Ethernet header into an Ethernet header
2060	 * with 802.1Q encapsulation.
2061	 */
2062	evl = mtod(m, struct ether_vlan_header *);
2063	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2064	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2065	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2066	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2067	m->m_flags &= ~M_VLANTAG;
2068	return m;
2069}
2070#endif /* MXGE_NEW_VLAN_API */
2071
2072static void
2073mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2074{
2075	struct mxge_pkt_info pi = {0,0,0,0};
2076	mxge_softc_t *sc;
2077	mcp_kreq_ether_send_t *req;
2078	bus_dma_segment_t *seg;
2079	struct mbuf *m_tmp;
2080	struct ifnet *ifp;
2081	mxge_tx_ring_t *tx;
2082	int cnt, cum_len, err, i, idx, odd_flag;
2083	uint16_t pseudo_hdr_offset;
2084        uint8_t flags, cksum_offset;
2085
2086
2087	sc = ss->sc;
2088	ifp = sc->ifp;
2089	tx = &ss->tx;
2090
2091#ifdef MXGE_NEW_VLAN_API
2092	if (m->m_flags & M_VLANTAG) {
2093		m = mxge_vlan_tag_insert(m);
2094		if (__predict_false(m == NULL))
2095			goto drop_without_m;
2096	}
2097#endif
2098	if (m->m_pkthdr.csum_flags &
2099	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2100		if (mxge_parse_tx(ss, m, &pi))
2101			goto drop;
2102	}
2103
2104	/* (try to) map the frame for DMA */
2105	idx = tx->req & tx->mask;
2106	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2107				      m, tx->seg_list, &cnt,
2108				      BUS_DMA_NOWAIT);
2109	if (__predict_false(err == EFBIG)) {
2110		/* Too many segments in the chain.  Try
2111		   to defrag */
2112		m_tmp = m_defrag(m, M_NOWAIT);
2113		if (m_tmp == NULL) {
2114			goto drop;
2115		}
2116		ss->tx.defrag++;
2117		m = m_tmp;
2118		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2119					      tx->info[idx].map,
2120					      m, tx->seg_list, &cnt,
2121					      BUS_DMA_NOWAIT);
2122	}
2123	if (__predict_false(err != 0)) {
2124		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2125			      " packet len = %d\n", err, m->m_pkthdr.len);
2126		goto drop;
2127	}
2128	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2129			BUS_DMASYNC_PREWRITE);
2130	tx->info[idx].m = m;
2131
2132#if IFCAP_TSO4
2133	/* TSO is different enough, we handle it in another routine */
2134	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2135		mxge_encap_tso(ss, m, cnt, &pi);
2136		return;
2137	}
2138#endif
2139
2140	req = tx->req_list;
2141	cksum_offset = 0;
2142	pseudo_hdr_offset = 0;
2143	flags = MXGEFW_FLAGS_NO_TSO;
2144
2145	/* checksum offloading? */
2146	if (m->m_pkthdr.csum_flags &
2147	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2148		/* ensure ip header is in first mbuf, copy
2149		   it to a scratch buffer if not */
2150		cksum_offset = pi.ip_off + pi.ip_hlen;
2151		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2152		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2153		req->cksum_offset = cksum_offset;
2154		flags |= MXGEFW_FLAGS_CKSUM;
2155		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2156	} else {
2157		odd_flag = 0;
2158	}
2159	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2160		flags |= MXGEFW_FLAGS_SMALL;
2161
2162	/* convert segments into a request list */
2163	cum_len = 0;
2164	seg = tx->seg_list;
2165	req->flags = MXGEFW_FLAGS_FIRST;
2166	for (i = 0; i < cnt; i++) {
2167		req->addr_low =
2168			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2169		req->addr_high =
2170			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2171		req->length = htobe16(seg->ds_len);
2172		req->cksum_offset = cksum_offset;
2173		if (cksum_offset > seg->ds_len)
2174			cksum_offset -= seg->ds_len;
2175		else
2176			cksum_offset = 0;
2177		req->pseudo_hdr_offset = pseudo_hdr_offset;
2178		req->pad = 0; /* complete solid 16-byte block */
2179		req->rdma_count = 1;
2180		req->flags |= flags | ((cum_len & 1) * odd_flag);
2181		cum_len += seg->ds_len;
2182		seg++;
2183		req++;
2184		req->flags = 0;
2185	}
2186	req--;
2187	/* pad runts to 60 bytes */
2188	if (cum_len < 60) {
2189		req++;
2190		req->addr_low =
2191			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2192		req->addr_high =
2193			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2194		req->length = htobe16(60 - cum_len);
2195		req->cksum_offset = 0;
2196		req->pseudo_hdr_offset = pseudo_hdr_offset;
2197		req->pad = 0; /* complete solid 16-byte block */
2198		req->rdma_count = 1;
2199		req->flags |= flags | ((cum_len & 1) * odd_flag);
2200		cnt++;
2201	}
2202
2203	tx->req_list[0].rdma_count = cnt;
2204#if 0
2205	/* print what the firmware will see */
2206	for (i = 0; i < cnt; i++) {
2207		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2208		    "cso:%d, flags:0x%x, rdma:%d\n",
2209		    i, (int)ntohl(tx->req_list[i].addr_high),
2210		    (int)ntohl(tx->req_list[i].addr_low),
2211		    (int)ntohs(tx->req_list[i].length),
2212		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2213		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2214		    tx->req_list[i].rdma_count);
2215	}
2216	printf("--------------\n");
2217#endif
2218	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2219	mxge_submit_req(tx, tx->req_list, cnt);
2220#ifdef IFNET_BUF_RING
2221	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2222		/* tell the NIC to start polling this slice */
2223		*tx->send_go = 1;
2224		tx->queue_active = 1;
2225		tx->activate++;
2226		wmb();
2227	}
2228#endif
2229	return;
2230
2231drop:
2232	m_freem(m);
2233drop_without_m:
2234	ss->oerrors++;
2235	return;
2236}
2237
2238#ifdef IFNET_BUF_RING
2239static void
2240mxge_qflush(struct ifnet *ifp)
2241{
2242	mxge_softc_t *sc = ifp->if_softc;
2243	mxge_tx_ring_t *tx;
2244	struct mbuf *m;
2245	int slice;
2246
2247	for (slice = 0; slice < sc->num_slices; slice++) {
2248		tx = &sc->ss[slice].tx;
2249		mtx_lock(&tx->mtx);
2250		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2251			m_freem(m);
2252		mtx_unlock(&tx->mtx);
2253	}
2254	if_qflush(ifp);
2255}
2256
2257static inline void
2258mxge_start_locked(struct mxge_slice_state *ss)
2259{
2260	mxge_softc_t *sc;
2261	struct mbuf *m;
2262	struct ifnet *ifp;
2263	mxge_tx_ring_t *tx;
2264
2265	sc = ss->sc;
2266	ifp = sc->ifp;
2267	tx = &ss->tx;
2268
2269	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2270		m = drbr_dequeue(ifp, tx->br);
2271		if (m == NULL) {
2272			return;
2273		}
2274		/* let BPF see it */
2275		BPF_MTAP(ifp, m);
2276
2277		/* give it to the nic */
2278		mxge_encap(ss, m);
2279	}
2280	/* ran out of transmit slots */
2281	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2282	    && (!drbr_empty(ifp, tx->br))) {
2283		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2284		tx->stall++;
2285	}
2286}
2287
2288static int
2289mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2290{
2291	mxge_softc_t *sc;
2292	struct ifnet *ifp;
2293	mxge_tx_ring_t *tx;
2294	int err;
2295
2296	sc = ss->sc;
2297	ifp = sc->ifp;
2298	tx = &ss->tx;
2299
2300	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2301	    IFF_DRV_RUNNING) {
2302		err = drbr_enqueue(ifp, tx->br, m);
2303		return (err);
2304	}
2305
2306	if (!drbr_needs_enqueue(ifp, tx->br) &&
2307	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2308		/* let BPF see it */
2309		BPF_MTAP(ifp, m);
2310		/* give it to the nic */
2311		mxge_encap(ss, m);
2312	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2313		return (err);
2314	}
2315	if (!drbr_empty(ifp, tx->br))
2316		mxge_start_locked(ss);
2317	return (0);
2318}
2319
2320static int
2321mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2322{
2323	mxge_softc_t *sc = ifp->if_softc;
2324	struct mxge_slice_state *ss;
2325	mxge_tx_ring_t *tx;
2326	int err = 0;
2327	int slice;
2328
2329	slice = m->m_pkthdr.flowid;
2330	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2331
2332	ss = &sc->ss[slice];
2333	tx = &ss->tx;
2334
2335	if (mtx_trylock(&tx->mtx)) {
2336		err = mxge_transmit_locked(ss, m);
2337		mtx_unlock(&tx->mtx);
2338	} else {
2339		err = drbr_enqueue(ifp, tx->br, m);
2340	}
2341
2342	return (err);
2343}
2344
2345#else
2346
2347static inline void
2348mxge_start_locked(struct mxge_slice_state *ss)
2349{
2350	mxge_softc_t *sc;
2351	struct mbuf *m;
2352	struct ifnet *ifp;
2353	mxge_tx_ring_t *tx;
2354
2355	sc = ss->sc;
2356	ifp = sc->ifp;
2357	tx = &ss->tx;
2358	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2359		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2360		if (m == NULL) {
2361			return;
2362		}
2363		/* let BPF see it */
2364		BPF_MTAP(ifp, m);
2365
2366		/* give it to the nic */
2367		mxge_encap(ss, m);
2368	}
2369	/* ran out of transmit slots */
2370	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2371		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2372		tx->stall++;
2373	}
2374}
2375#endif
2376static void
2377mxge_start(struct ifnet *ifp)
2378{
2379	mxge_softc_t *sc = ifp->if_softc;
2380	struct mxge_slice_state *ss;
2381
2382	/* only use the first slice for now */
2383	ss = &sc->ss[0];
2384	mtx_lock(&ss->tx.mtx);
2385	mxge_start_locked(ss);
2386	mtx_unlock(&ss->tx.mtx);
2387}
2388
2389/*
2390 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2391 * at most 32 bytes at a time, so as to avoid involving the software
2392 * pio handler in the nic.   We re-write the first segment's low
2393 * DMA address to mark it valid only after we write the entire chunk
2394 * in a burst
2395 */
2396static inline void
2397mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2398		mcp_kreq_ether_recv_t *src)
2399{
2400	uint32_t low;
2401
2402	low = src->addr_low;
2403	src->addr_low = 0xffffffff;
2404	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2405	wmb();
2406	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2407	wmb();
2408	src->addr_low = low;
2409	dst->addr_low = low;
2410	wmb();
2411}
2412
2413static int
2414mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2415{
2416	bus_dma_segment_t seg;
2417	struct mbuf *m;
2418	mxge_rx_ring_t *rx = &ss->rx_small;
2419	int cnt, err;
2420
2421	m = m_gethdr(M_NOWAIT, MT_DATA);
2422	if (m == NULL) {
2423		rx->alloc_fail++;
2424		err = ENOBUFS;
2425		goto done;
2426	}
2427	m->m_len = MHLEN;
2428	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2429				      &seg, &cnt, BUS_DMA_NOWAIT);
2430	if (err != 0) {
2431		m_free(m);
2432		goto done;
2433	}
2434	rx->info[idx].m = m;
2435	rx->shadow[idx].addr_low =
2436		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2437	rx->shadow[idx].addr_high =
2438		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2439
2440done:
2441	if ((idx & 7) == 7)
2442		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2443	return err;
2444}
2445
2446static int
2447mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2448{
2449	bus_dma_segment_t seg[3];
2450	struct mbuf *m;
2451	mxge_rx_ring_t *rx = &ss->rx_big;
2452	int cnt, err, i;
2453
2454	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2455	if (m == NULL) {
2456		rx->alloc_fail++;
2457		err = ENOBUFS;
2458		goto done;
2459	}
2460	m->m_len = rx->mlen;
2461	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2462				      seg, &cnt, BUS_DMA_NOWAIT);
2463	if (err != 0) {
2464		m_free(m);
2465		goto done;
2466	}
2467	rx->info[idx].m = m;
2468	rx->shadow[idx].addr_low =
2469		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2470	rx->shadow[idx].addr_high =
2471		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2472
2473#if MXGE_VIRT_JUMBOS
2474	for (i = 1; i < cnt; i++) {
2475		rx->shadow[idx + i].addr_low =
2476			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2477		rx->shadow[idx + i].addr_high =
2478			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2479       }
2480#endif
2481
2482done:
2483       for (i = 0; i < rx->nbufs; i++) {
2484		if ((idx & 7) == 7) {
2485			mxge_submit_8rx(&rx->lanai[idx - 7],
2486					&rx->shadow[idx - 7]);
2487		}
2488		idx++;
2489	}
2490	return err;
2491}
2492
2493#ifdef INET6
2494
2495static uint16_t
2496mxge_csum_generic(uint16_t *raw, int len)
2497{
2498	uint32_t csum;
2499
2500
2501	csum = 0;
2502	while (len > 0) {
2503		csum += *raw;
2504		raw++;
2505		len -= 2;
2506	}
2507	csum = (csum >> 16) + (csum & 0xffff);
2508	csum = (csum >> 16) + (csum & 0xffff);
2509	return (uint16_t)csum;
2510}
2511
2512static inline uint16_t
2513mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2514{
2515	uint32_t partial;
2516	int nxt, cksum_offset;
2517	struct ip6_hdr *ip6 = p;
2518	uint16_t c;
2519
2520	nxt = ip6->ip6_nxt;
2521	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2522	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2523		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2524					   IPPROTO_IPV6, &nxt);
2525		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2526			return (1);
2527	}
2528
2529	/*
2530	 * IPv6 headers do not contain a checksum, and hence
2531	 * do not checksum to zero, so they don't "fall out"
2532	 * of the partial checksum calculation like IPv4
2533	 * headers do.  We need to fix the partial checksum by
2534	 * subtracting the checksum of the IPv6 header.
2535	 */
2536
2537	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2538				    ETHER_HDR_LEN);
2539	csum += ~partial;
2540	csum +=	 (csum < ~partial);
2541	csum = (csum >> 16) + (csum & 0xFFFF);
2542	csum = (csum >> 16) + (csum & 0xFFFF);
2543	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2544			     csum);
2545	c ^= 0xffff;
2546	return (c);
2547}
2548#endif /* INET6 */
2549/*
2550 *  Myri10GE hardware checksums are not valid if the sender
2551 *  padded the frame with non-zero padding.  This is because
2552 *  the firmware just does a simple 16-bit 1s complement
2553 *  checksum across the entire frame, excluding the first 14
2554 *  bytes.  It is best to simply to check the checksum and
2555 *  tell the stack about it only if the checksum is good
2556 */
2557
2558static inline uint16_t
2559mxge_rx_csum(struct mbuf *m, int csum)
2560{
2561	struct ether_header *eh;
2562#ifdef INET
2563	struct ip *ip;
2564#endif
2565#if defined(INET) || defined(INET6)
2566	int cap = m->m_pkthdr.rcvif->if_capenable;
2567#endif
2568	uint16_t c, etype;
2569
2570
2571	eh = mtod(m, struct ether_header *);
2572	etype = ntohs(eh->ether_type);
2573	switch (etype) {
2574#ifdef INET
2575	case ETHERTYPE_IP:
2576		if ((cap & IFCAP_RXCSUM) == 0)
2577			return (1);
2578		ip = (struct ip *)(eh + 1);
2579		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2580			return (1);
2581		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2582			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2583				    (ip->ip_hl << 2) + ip->ip_p));
2584		c ^= 0xffff;
2585		break;
2586#endif
2587#ifdef INET6
2588	case ETHERTYPE_IPV6:
2589		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2590			return (1);
2591		c = mxge_rx_csum6((eh + 1), m, csum);
2592		break;
2593#endif
2594	default:
2595		c = 1;
2596	}
2597	return (c);
2598}
2599
2600static void
2601mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2602{
2603	struct ether_vlan_header *evl;
2604	struct ether_header *eh;
2605	uint32_t partial;
2606
2607	evl = mtod(m, struct ether_vlan_header *);
2608	eh = mtod(m, struct ether_header *);
2609
2610	/*
2611	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2612	 * after what the firmware thought was the end of the ethernet
2613	 * header.
2614	 */
2615
2616	/* put checksum into host byte order */
2617	*csum = ntohs(*csum);
2618	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2619	(*csum) += ~partial;
2620	(*csum) +=  ((*csum) < ~partial);
2621	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2622	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2623
2624	/* restore checksum to network byte order;
2625	   later consumers expect this */
2626	*csum = htons(*csum);
2627
2628	/* save the tag */
2629#ifdef MXGE_NEW_VLAN_API
2630	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2631#else
2632	{
2633		struct m_tag *mtag;
2634		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2635				   M_NOWAIT);
2636		if (mtag == NULL)
2637			return;
2638		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2639		m_tag_prepend(m, mtag);
2640	}
2641
2642#endif
2643	m->m_flags |= M_VLANTAG;
2644
2645	/*
2646	 * Remove the 802.1q header by copying the Ethernet
2647	 * addresses over it and adjusting the beginning of
2648	 * the data in the mbuf.  The encapsulated Ethernet
2649	 * type field is already in place.
2650	 */
2651	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2652	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2653	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2654}
2655
2656
2657static inline void
2658mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2659		 uint32_t csum, int lro)
2660{
2661	mxge_softc_t *sc;
2662	struct ifnet *ifp;
2663	struct mbuf *m;
2664	struct ether_header *eh;
2665	mxge_rx_ring_t *rx;
2666	bus_dmamap_t old_map;
2667	int idx;
2668
2669	sc = ss->sc;
2670	ifp = sc->ifp;
2671	rx = &ss->rx_big;
2672	idx = rx->cnt & rx->mask;
2673	rx->cnt += rx->nbufs;
2674	/* save a pointer to the received mbuf */
2675	m = rx->info[idx].m;
2676	/* try to replace the received mbuf */
2677	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2678		/* drop the frame -- the old mbuf is re-cycled */
2679		ifp->if_ierrors++;
2680		return;
2681	}
2682
2683	/* unmap the received buffer */
2684	old_map = rx->info[idx].map;
2685	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2686	bus_dmamap_unload(rx->dmat, old_map);
2687
2688	/* swap the bus_dmamap_t's */
2689	rx->info[idx].map = rx->extra_map;
2690	rx->extra_map = old_map;
2691
2692	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2693	 * aligned */
2694	m->m_data += MXGEFW_PAD;
2695
2696	m->m_pkthdr.rcvif = ifp;
2697	m->m_len = m->m_pkthdr.len = len;
2698	ss->ipackets++;
2699	eh = mtod(m, struct ether_header *);
2700	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2701		mxge_vlan_tag_remove(m, &csum);
2702	}
2703	/* if the checksum is valid, mark it in the mbuf header */
2704
2705	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2706	    (0 == mxge_rx_csum(m, csum))) {
2707		/* Tell the stack that the  checksum is good */
2708		m->m_pkthdr.csum_data = 0xffff;
2709		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2710			CSUM_DATA_VALID;
2711
2712#if defined(INET) || defined (INET6)
2713		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2714			return;
2715#endif
2716	}
2717	/* flowid only valid if RSS hashing is enabled */
2718	if (sc->num_slices > 1) {
2719		m->m_pkthdr.flowid = (ss - sc->ss);
2720		m->m_flags |= M_FLOWID;
2721	}
2722	/* pass the frame up the stack */
2723	(*ifp->if_input)(ifp, m);
2724}
2725
2726static inline void
2727mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2728		   uint32_t csum, int lro)
2729{
2730	mxge_softc_t *sc;
2731	struct ifnet *ifp;
2732	struct ether_header *eh;
2733	struct mbuf *m;
2734	mxge_rx_ring_t *rx;
2735	bus_dmamap_t old_map;
2736	int idx;
2737
2738	sc = ss->sc;
2739	ifp = sc->ifp;
2740	rx = &ss->rx_small;
2741	idx = rx->cnt & rx->mask;
2742	rx->cnt++;
2743	/* save a pointer to the received mbuf */
2744	m = rx->info[idx].m;
2745	/* try to replace the received mbuf */
2746	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2747		/* drop the frame -- the old mbuf is re-cycled */
2748		ifp->if_ierrors++;
2749		return;
2750	}
2751
2752	/* unmap the received buffer */
2753	old_map = rx->info[idx].map;
2754	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2755	bus_dmamap_unload(rx->dmat, old_map);
2756
2757	/* swap the bus_dmamap_t's */
2758	rx->info[idx].map = rx->extra_map;
2759	rx->extra_map = old_map;
2760
2761	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2762	 * aligned */
2763	m->m_data += MXGEFW_PAD;
2764
2765	m->m_pkthdr.rcvif = ifp;
2766	m->m_len = m->m_pkthdr.len = len;
2767	ss->ipackets++;
2768	eh = mtod(m, struct ether_header *);
2769	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2770		mxge_vlan_tag_remove(m, &csum);
2771	}
2772	/* if the checksum is valid, mark it in the mbuf header */
2773	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2774	    (0 == mxge_rx_csum(m, csum))) {
2775		/* Tell the stack that the  checksum is good */
2776		m->m_pkthdr.csum_data = 0xffff;
2777		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2778			CSUM_DATA_VALID;
2779
2780#if defined(INET) || defined (INET6)
2781		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2782			return;
2783#endif
2784	}
2785	/* flowid only valid if RSS hashing is enabled */
2786	if (sc->num_slices > 1) {
2787		m->m_pkthdr.flowid = (ss - sc->ss);
2788		m->m_flags |= M_FLOWID;
2789	}
2790	/* pass the frame up the stack */
2791	(*ifp->if_input)(ifp, m);
2792}
2793
2794static inline void
2795mxge_clean_rx_done(struct mxge_slice_state *ss)
2796{
2797	mxge_rx_done_t *rx_done = &ss->rx_done;
2798	int limit = 0;
2799	uint16_t length;
2800	uint16_t checksum;
2801	int lro;
2802
2803	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2804	while (rx_done->entry[rx_done->idx].length != 0) {
2805		length = ntohs(rx_done->entry[rx_done->idx].length);
2806		rx_done->entry[rx_done->idx].length = 0;
2807		checksum = rx_done->entry[rx_done->idx].checksum;
2808		if (length <= (MHLEN - MXGEFW_PAD))
2809			mxge_rx_done_small(ss, length, checksum, lro);
2810		else
2811			mxge_rx_done_big(ss, length, checksum, lro);
2812		rx_done->cnt++;
2813		rx_done->idx = rx_done->cnt & rx_done->mask;
2814
2815		/* limit potential for livelock */
2816		if (__predict_false(++limit > rx_done->mask / 2))
2817			break;
2818	}
2819#if defined(INET)  || defined (INET6)
2820	while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2821		struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2822		SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2823		tcp_lro_flush(&ss->lc, lro);
2824	}
2825#endif
2826}
2827
2828
2829static inline void
2830mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2831{
2832	struct ifnet *ifp;
2833	mxge_tx_ring_t *tx;
2834	struct mbuf *m;
2835	bus_dmamap_t map;
2836	int idx;
2837	int *flags;
2838
2839	tx = &ss->tx;
2840	ifp = ss->sc->ifp;
2841	while (tx->pkt_done != mcp_idx) {
2842		idx = tx->done & tx->mask;
2843		tx->done++;
2844		m = tx->info[idx].m;
2845		/* mbuf and DMA map only attached to the first
2846		   segment per-mbuf */
2847		if (m != NULL) {
2848			ss->obytes += m->m_pkthdr.len;
2849			if (m->m_flags & M_MCAST)
2850				ss->omcasts++;
2851			ss->opackets++;
2852			tx->info[idx].m = NULL;
2853			map = tx->info[idx].map;
2854			bus_dmamap_unload(tx->dmat, map);
2855			m_freem(m);
2856		}
2857		if (tx->info[idx].flag) {
2858			tx->info[idx].flag = 0;
2859			tx->pkt_done++;
2860		}
2861	}
2862
2863	/* If we have space, clear IFF_OACTIVE to tell the stack that
2864           its OK to send packets */
2865#ifdef IFNET_BUF_RING
2866	flags = &ss->if_drv_flags;
2867#else
2868	flags = &ifp->if_drv_flags;
2869#endif
2870	mtx_lock(&ss->tx.mtx);
2871	if ((*flags) & IFF_DRV_OACTIVE &&
2872	    tx->req - tx->done < (tx->mask + 1)/4) {
2873		*(flags) &= ~IFF_DRV_OACTIVE;
2874		ss->tx.wake++;
2875		mxge_start_locked(ss);
2876	}
2877#ifdef IFNET_BUF_RING
2878	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2879		/* let the NIC stop polling this queue, since there
2880		 * are no more transmits pending */
2881		if (tx->req == tx->done) {
2882			*tx->send_stop = 1;
2883			tx->queue_active = 0;
2884			tx->deactivate++;
2885			wmb();
2886		}
2887	}
2888#endif
2889	mtx_unlock(&ss->tx.mtx);
2890
2891}
2892
2893static struct mxge_media_type mxge_xfp_media_types[] =
2894{
2895	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2896	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2897	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2898	{0,		(1 << 5),	"10GBASE-ER"},
2899	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2900	{0,		(1 << 3),	"10GBASE-SW"},
2901	{0,		(1 << 2),	"10GBASE-LW"},
2902	{0,		(1 << 1),	"10GBASE-EW"},
2903	{0,		(1 << 0),	"Reserved"}
2904};
2905static struct mxge_media_type mxge_sfp_media_types[] =
2906{
2907	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2908	{0,		(1 << 7),	"Reserved"},
2909	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2910	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2911	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2912	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2913};
2914
2915static void
2916mxge_media_set(mxge_softc_t *sc, int media_type)
2917{
2918
2919
2920	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2921		    0, NULL);
2922	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2923	sc->current_media = media_type;
2924	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2925}
2926
2927static void
2928mxge_media_init(mxge_softc_t *sc)
2929{
2930	char *ptr;
2931	int i;
2932
2933	ifmedia_removeall(&sc->media);
2934	mxge_media_set(sc, IFM_AUTO);
2935
2936	/*
2937	 * parse the product code to deterimine the interface type
2938	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2939	 * after the 3rd dash in the driver's cached copy of the
2940	 * EEPROM's product code string.
2941	 */
2942	ptr = sc->product_code_string;
2943	if (ptr == NULL) {
2944		device_printf(sc->dev, "Missing product code\n");
2945		return;
2946	}
2947
2948	for (i = 0; i < 3; i++, ptr++) {
2949		ptr = index(ptr, '-');
2950		if (ptr == NULL) {
2951			device_printf(sc->dev,
2952				      "only %d dashes in PC?!?\n", i);
2953			return;
2954		}
2955	}
2956	if (*ptr == 'C' || *(ptr +1) == 'C') {
2957		/* -C is CX4 */
2958		sc->connector = MXGE_CX4;
2959		mxge_media_set(sc, IFM_10G_CX4);
2960	} else if (*ptr == 'Q') {
2961		/* -Q is Quad Ribbon Fiber */
2962		sc->connector = MXGE_QRF;
2963		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2964		/* FreeBSD has no media type for Quad ribbon fiber */
2965	} else if (*ptr == 'R') {
2966		/* -R is XFP */
2967		sc->connector = MXGE_XFP;
2968	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2969		/* -S or -2S is SFP+ */
2970		sc->connector = MXGE_SFP;
2971	} else {
2972		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2973	}
2974}
2975
2976/*
2977 * Determine the media type for a NIC.  Some XFPs will identify
2978 * themselves only when their link is up, so this is initiated via a
2979 * link up interrupt.  However, this can potentially take up to
2980 * several milliseconds, so it is run via the watchdog routine, rather
2981 * than in the interrupt handler itself.
2982 */
2983static void
2984mxge_media_probe(mxge_softc_t *sc)
2985{
2986	mxge_cmd_t cmd;
2987	char *cage_type;
2988
2989	struct mxge_media_type *mxge_media_types = NULL;
2990	int i, err, ms, mxge_media_type_entries;
2991	uint32_t byte;
2992
2993	sc->need_media_probe = 0;
2994
2995	if (sc->connector == MXGE_XFP) {
2996		/* -R is XFP */
2997		mxge_media_types = mxge_xfp_media_types;
2998		mxge_media_type_entries =
2999			sizeof (mxge_xfp_media_types) /
3000			sizeof (mxge_xfp_media_types[0]);
3001		byte = MXGE_XFP_COMPLIANCE_BYTE;
3002		cage_type = "XFP";
3003	} else 	if (sc->connector == MXGE_SFP) {
3004		/* -S or -2S is SFP+ */
3005		mxge_media_types = mxge_sfp_media_types;
3006		mxge_media_type_entries =
3007			sizeof (mxge_sfp_media_types) /
3008			sizeof (mxge_sfp_media_types[0]);
3009		cage_type = "SFP+";
3010		byte = 3;
3011	} else {
3012		/* nothing to do; media type cannot change */
3013		return;
3014	}
3015
3016	/*
3017	 * At this point we know the NIC has an XFP cage, so now we
3018	 * try to determine what is in the cage by using the
3019	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
3020	 * register.  We read just one byte, which may take over
3021	 * a millisecond
3022	 */
3023
3024	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3025	cmd.data1 = byte;
3026	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3027	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3028		device_printf(sc->dev, "failed to read XFP\n");
3029	}
3030	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3031		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3032	}
3033	if (err != MXGEFW_CMD_OK) {
3034		return;
3035	}
3036
3037	/* now we wait for the data to be cached */
3038	cmd.data0 = byte;
3039	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3040	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3041		DELAY(1000);
3042		cmd.data0 = byte;
3043		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3044	}
3045	if (err != MXGEFW_CMD_OK) {
3046		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3047			      cage_type, err, ms);
3048		return;
3049	}
3050
3051	if (cmd.data0 == mxge_media_types[0].bitmask) {
3052		if (mxge_verbose)
3053			device_printf(sc->dev, "%s:%s\n", cage_type,
3054				      mxge_media_types[0].name);
3055		if (sc->current_media != mxge_media_types[0].flag) {
3056			mxge_media_init(sc);
3057			mxge_media_set(sc, mxge_media_types[0].flag);
3058		}
3059		return;
3060	}
3061	for (i = 1; i < mxge_media_type_entries; i++) {
3062		if (cmd.data0 & mxge_media_types[i].bitmask) {
3063			if (mxge_verbose)
3064				device_printf(sc->dev, "%s:%s\n",
3065					      cage_type,
3066					      mxge_media_types[i].name);
3067
3068			if (sc->current_media != mxge_media_types[i].flag) {
3069				mxge_media_init(sc);
3070				mxge_media_set(sc, mxge_media_types[i].flag);
3071			}
3072			return;
3073		}
3074	}
3075	if (mxge_verbose)
3076		device_printf(sc->dev, "%s media 0x%x unknown\n",
3077			      cage_type, cmd.data0);
3078
3079	return;
3080}
3081
3082static void
3083mxge_intr(void *arg)
3084{
3085	struct mxge_slice_state *ss = arg;
3086	mxge_softc_t *sc = ss->sc;
3087	mcp_irq_data_t *stats = ss->fw_stats;
3088	mxge_tx_ring_t *tx = &ss->tx;
3089	mxge_rx_done_t *rx_done = &ss->rx_done;
3090	uint32_t send_done_count;
3091	uint8_t valid;
3092
3093
3094#ifndef IFNET_BUF_RING
3095	/* an interrupt on a non-zero slice is implicitly valid
3096	   since MSI-X irqs are not shared */
3097	if (ss != sc->ss) {
3098		mxge_clean_rx_done(ss);
3099		*ss->irq_claim = be32toh(3);
3100		return;
3101	}
3102#endif
3103
3104	/* make sure the DMA has finished */
3105	if (!stats->valid) {
3106		return;
3107	}
3108	valid = stats->valid;
3109
3110	if (sc->legacy_irq) {
3111		/* lower legacy IRQ  */
3112		*sc->irq_deassert = 0;
3113		if (!mxge_deassert_wait)
3114			/* don't wait for conf. that irq is low */
3115			stats->valid = 0;
3116	} else {
3117		stats->valid = 0;
3118	}
3119
3120	/* loop while waiting for legacy irq deassertion */
3121	do {
3122		/* check for transmit completes and receives */
3123		send_done_count = be32toh(stats->send_done_count);
3124		while ((send_done_count != tx->pkt_done) ||
3125		       (rx_done->entry[rx_done->idx].length != 0)) {
3126			if (send_done_count != tx->pkt_done)
3127				mxge_tx_done(ss, (int)send_done_count);
3128			mxge_clean_rx_done(ss);
3129			send_done_count = be32toh(stats->send_done_count);
3130		}
3131		if (sc->legacy_irq && mxge_deassert_wait)
3132			wmb();
3133	} while (*((volatile uint8_t *) &stats->valid));
3134
3135	/* fw link & error stats meaningful only on the first slice */
3136	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3137		if (sc->link_state != stats->link_up) {
3138			sc->link_state = stats->link_up;
3139			if (sc->link_state) {
3140				if_link_state_change(sc->ifp, LINK_STATE_UP);
3141				 sc->ifp->if_baudrate = IF_Gbps(10UL);
3142				if (mxge_verbose)
3143					device_printf(sc->dev, "link up\n");
3144			} else {
3145				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3146				sc->ifp->if_baudrate = 0;
3147				if (mxge_verbose)
3148					device_printf(sc->dev, "link down\n");
3149			}
3150			sc->need_media_probe = 1;
3151		}
3152		if (sc->rdma_tags_available !=
3153		    be32toh(stats->rdma_tags_available)) {
3154			sc->rdma_tags_available =
3155				be32toh(stats->rdma_tags_available);
3156			device_printf(sc->dev, "RDMA timed out! %d tags "
3157				      "left\n", sc->rdma_tags_available);
3158		}
3159
3160		if (stats->link_down) {
3161			sc->down_cnt += stats->link_down;
3162			sc->link_state = 0;
3163			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3164		}
3165	}
3166
3167	/* check to see if we have rx token to pass back */
3168	if (valid & 0x1)
3169	    *ss->irq_claim = be32toh(3);
3170	*(ss->irq_claim + 1) = be32toh(3);
3171}
3172
3173static void
3174mxge_init(void *arg)
3175{
3176	mxge_softc_t *sc = arg;
3177	struct ifnet *ifp = sc->ifp;
3178
3179
3180	mtx_lock(&sc->driver_mtx);
3181	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3182		(void) mxge_open(sc);
3183	mtx_unlock(&sc->driver_mtx);
3184}
3185
3186
3187
3188static void
3189mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3190{
3191	int i;
3192
3193#if defined(INET) || defined(INET6)
3194	tcp_lro_free(&ss->lc);
3195#endif
3196	for (i = 0; i <= ss->rx_big.mask; i++) {
3197		if (ss->rx_big.info[i].m == NULL)
3198			continue;
3199		bus_dmamap_unload(ss->rx_big.dmat,
3200				  ss->rx_big.info[i].map);
3201		m_freem(ss->rx_big.info[i].m);
3202		ss->rx_big.info[i].m = NULL;
3203	}
3204
3205	for (i = 0; i <= ss->rx_small.mask; i++) {
3206		if (ss->rx_small.info[i].m == NULL)
3207			continue;
3208		bus_dmamap_unload(ss->rx_small.dmat,
3209				  ss->rx_small.info[i].map);
3210		m_freem(ss->rx_small.info[i].m);
3211		ss->rx_small.info[i].m = NULL;
3212	}
3213
3214	/* transmit ring used only on the first slice */
3215	if (ss->tx.info == NULL)
3216		return;
3217
3218	for (i = 0; i <= ss->tx.mask; i++) {
3219		ss->tx.info[i].flag = 0;
3220		if (ss->tx.info[i].m == NULL)
3221			continue;
3222		bus_dmamap_unload(ss->tx.dmat,
3223				  ss->tx.info[i].map);
3224		m_freem(ss->tx.info[i].m);
3225		ss->tx.info[i].m = NULL;
3226	}
3227}
3228
3229static void
3230mxge_free_mbufs(mxge_softc_t *sc)
3231{
3232	int slice;
3233
3234	for (slice = 0; slice < sc->num_slices; slice++)
3235		mxge_free_slice_mbufs(&sc->ss[slice]);
3236}
3237
3238static void
3239mxge_free_slice_rings(struct mxge_slice_state *ss)
3240{
3241	int i;
3242
3243
3244	if (ss->rx_done.entry != NULL)
3245		mxge_dma_free(&ss->rx_done.dma);
3246	ss->rx_done.entry = NULL;
3247
3248	if (ss->tx.req_bytes != NULL)
3249		free(ss->tx.req_bytes, M_DEVBUF);
3250	ss->tx.req_bytes = NULL;
3251
3252	if (ss->tx.seg_list != NULL)
3253		free(ss->tx.seg_list, M_DEVBUF);
3254	ss->tx.seg_list = NULL;
3255
3256	if (ss->rx_small.shadow != NULL)
3257		free(ss->rx_small.shadow, M_DEVBUF);
3258	ss->rx_small.shadow = NULL;
3259
3260	if (ss->rx_big.shadow != NULL)
3261		free(ss->rx_big.shadow, M_DEVBUF);
3262	ss->rx_big.shadow = NULL;
3263
3264	if (ss->tx.info != NULL) {
3265		if (ss->tx.dmat != NULL) {
3266			for (i = 0; i <= ss->tx.mask; i++) {
3267				bus_dmamap_destroy(ss->tx.dmat,
3268						   ss->tx.info[i].map);
3269			}
3270			bus_dma_tag_destroy(ss->tx.dmat);
3271		}
3272		free(ss->tx.info, M_DEVBUF);
3273	}
3274	ss->tx.info = NULL;
3275
3276	if (ss->rx_small.info != NULL) {
3277		if (ss->rx_small.dmat != NULL) {
3278			for (i = 0; i <= ss->rx_small.mask; i++) {
3279				bus_dmamap_destroy(ss->rx_small.dmat,
3280						   ss->rx_small.info[i].map);
3281			}
3282			bus_dmamap_destroy(ss->rx_small.dmat,
3283					   ss->rx_small.extra_map);
3284			bus_dma_tag_destroy(ss->rx_small.dmat);
3285		}
3286		free(ss->rx_small.info, M_DEVBUF);
3287	}
3288	ss->rx_small.info = NULL;
3289
3290	if (ss->rx_big.info != NULL) {
3291		if (ss->rx_big.dmat != NULL) {
3292			for (i = 0; i <= ss->rx_big.mask; i++) {
3293				bus_dmamap_destroy(ss->rx_big.dmat,
3294						   ss->rx_big.info[i].map);
3295			}
3296			bus_dmamap_destroy(ss->rx_big.dmat,
3297					   ss->rx_big.extra_map);
3298			bus_dma_tag_destroy(ss->rx_big.dmat);
3299		}
3300		free(ss->rx_big.info, M_DEVBUF);
3301	}
3302	ss->rx_big.info = NULL;
3303}
3304
3305static void
3306mxge_free_rings(mxge_softc_t *sc)
3307{
3308	int slice;
3309
3310	for (slice = 0; slice < sc->num_slices; slice++)
3311		mxge_free_slice_rings(&sc->ss[slice]);
3312}
3313
3314static int
3315mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3316		       int tx_ring_entries)
3317{
3318	mxge_softc_t *sc = ss->sc;
3319	size_t bytes;
3320	int err, i;
3321
3322	/* allocate per-slice receive resources */
3323
3324	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3325	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3326
3327	/* allocate the rx shadow rings */
3328	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3329	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3330
3331	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3332	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3333
3334	/* allocate the rx host info rings */
3335	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3336	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3337
3338	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3339	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3340
3341	/* allocate the rx busdma resources */
3342	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3343				 1,			/* alignment */
3344				 4096,			/* boundary */
3345				 BUS_SPACE_MAXADDR,	/* low */
3346				 BUS_SPACE_MAXADDR,	/* high */
3347				 NULL, NULL,		/* filter */
3348				 MHLEN,			/* maxsize */
3349				 1,			/* num segs */
3350				 MHLEN,			/* maxsegsize */
3351				 BUS_DMA_ALLOCNOW,	/* flags */
3352				 NULL, NULL,		/* lock */
3353				 &ss->rx_small.dmat);	/* tag */
3354	if (err != 0) {
3355		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3356			      err);
3357		return err;
3358	}
3359
3360	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3361				 1,			/* alignment */
3362#if MXGE_VIRT_JUMBOS
3363				 4096,			/* boundary */
3364#else
3365				 0,			/* boundary */
3366#endif
3367				 BUS_SPACE_MAXADDR,	/* low */
3368				 BUS_SPACE_MAXADDR,	/* high */
3369				 NULL, NULL,		/* filter */
3370				 3*4096,		/* maxsize */
3371#if MXGE_VIRT_JUMBOS
3372				 3,			/* num segs */
3373				 4096,			/* maxsegsize*/
3374#else
3375				 1,			/* num segs */
3376				 MJUM9BYTES,		/* maxsegsize*/
3377#endif
3378				 BUS_DMA_ALLOCNOW,	/* flags */
3379				 NULL, NULL,		/* lock */
3380				 &ss->rx_big.dmat);	/* tag */
3381	if (err != 0) {
3382		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3383			      err);
3384		return err;
3385	}
3386	for (i = 0; i <= ss->rx_small.mask; i++) {
3387		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3388					&ss->rx_small.info[i].map);
3389		if (err != 0) {
3390			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3391				      err);
3392			return err;
3393		}
3394	}
3395	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3396				&ss->rx_small.extra_map);
3397	if (err != 0) {
3398		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3399			      err);
3400		return err;
3401	}
3402
3403	for (i = 0; i <= ss->rx_big.mask; i++) {
3404		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3405					&ss->rx_big.info[i].map);
3406		if (err != 0) {
3407			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3408				      err);
3409			return err;
3410		}
3411	}
3412	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3413				&ss->rx_big.extra_map);
3414	if (err != 0) {
3415		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3416			      err);
3417		return err;
3418	}
3419
3420	/* now allocate TX resouces */
3421
3422#ifndef IFNET_BUF_RING
3423	/* only use a single TX ring for now */
3424	if (ss != ss->sc->ss)
3425		return 0;
3426#endif
3427
3428	ss->tx.mask = tx_ring_entries - 1;
3429	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3430
3431
3432	/* allocate the tx request copy block */
3433	bytes = 8 +
3434		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3435	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3436	/* ensure req_list entries are aligned to 8 bytes */
3437	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3438		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3439
3440	/* allocate the tx busdma segment list */
3441	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3442	ss->tx.seg_list = (bus_dma_segment_t *)
3443		malloc(bytes, M_DEVBUF, M_WAITOK);
3444
3445	/* allocate the tx host info ring */
3446	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3447	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3448
3449	/* allocate the tx busdma resources */
3450	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3451				 1,			/* alignment */
3452				 sc->tx_boundary,	/* boundary */
3453				 BUS_SPACE_MAXADDR,	/* low */
3454				 BUS_SPACE_MAXADDR,	/* high */
3455				 NULL, NULL,		/* filter */
3456				 65536 + 256,		/* maxsize */
3457				 ss->tx.max_desc - 2,	/* num segs */
3458				 sc->tx_boundary,	/* maxsegsz */
3459				 BUS_DMA_ALLOCNOW,	/* flags */
3460				 NULL, NULL,		/* lock */
3461				 &ss->tx.dmat);		/* tag */
3462
3463	if (err != 0) {
3464		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3465			      err);
3466		return err;
3467	}
3468
3469	/* now use these tags to setup dmamaps for each slot
3470	   in the ring */
3471	for (i = 0; i <= ss->tx.mask; i++) {
3472		err = bus_dmamap_create(ss->tx.dmat, 0,
3473					&ss->tx.info[i].map);
3474		if (err != 0) {
3475			device_printf(sc->dev, "Err %d  tx dmamap\n",
3476				      err);
3477			return err;
3478		}
3479	}
3480	return 0;
3481
3482}
3483
3484static int
3485mxge_alloc_rings(mxge_softc_t *sc)
3486{
3487	mxge_cmd_t cmd;
3488	int tx_ring_size;
3489	int tx_ring_entries, rx_ring_entries;
3490	int err, slice;
3491
3492	/* get ring sizes */
3493	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3494	tx_ring_size = cmd.data0;
3495	if (err != 0) {
3496		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3497		goto abort;
3498	}
3499
3500	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3501	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3502	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3503	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3504	IFQ_SET_READY(&sc->ifp->if_snd);
3505
3506	for (slice = 0; slice < sc->num_slices; slice++) {
3507		err = mxge_alloc_slice_rings(&sc->ss[slice],
3508					     rx_ring_entries,
3509					     tx_ring_entries);
3510		if (err != 0)
3511			goto abort;
3512	}
3513	return 0;
3514
3515abort:
3516	mxge_free_rings(sc);
3517	return err;
3518
3519}
3520
3521
3522static void
3523mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3524{
3525	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3526
3527	if (bufsize < MCLBYTES) {
3528		/* easy, everything fits in a single buffer */
3529		*big_buf_size = MCLBYTES;
3530		*cl_size = MCLBYTES;
3531		*nbufs = 1;
3532		return;
3533	}
3534
3535	if (bufsize < MJUMPAGESIZE) {
3536		/* still easy, everything still fits in a single buffer */
3537		*big_buf_size = MJUMPAGESIZE;
3538		*cl_size = MJUMPAGESIZE;
3539		*nbufs = 1;
3540		return;
3541	}
3542#if MXGE_VIRT_JUMBOS
3543	/* now we need to use virtually contiguous buffers */
3544	*cl_size = MJUM9BYTES;
3545	*big_buf_size = 4096;
3546	*nbufs = mtu / 4096 + 1;
3547	/* needs to be a power of two, so round up */
3548	if (*nbufs == 3)
3549		*nbufs = 4;
3550#else
3551	*cl_size = MJUM9BYTES;
3552	*big_buf_size = MJUM9BYTES;
3553	*nbufs = 1;
3554#endif
3555}
3556
3557static int
3558mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3559{
3560	mxge_softc_t *sc;
3561	mxge_cmd_t cmd;
3562	bus_dmamap_t map;
3563	int err, i, slice;
3564
3565
3566	sc = ss->sc;
3567	slice = ss - sc->ss;
3568
3569#if defined(INET) || defined(INET6)
3570	(void)tcp_lro_init(&ss->lc);
3571#endif
3572	ss->lc.ifp = sc->ifp;
3573
3574	/* get the lanai pointers to the send and receive rings */
3575
3576	err = 0;
3577#ifndef IFNET_BUF_RING
3578	/* We currently only send from the first slice */
3579	if (slice == 0) {
3580#endif
3581		cmd.data0 = slice;
3582		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3583		ss->tx.lanai =
3584			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3585		ss->tx.send_go = (volatile uint32_t *)
3586			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3587		ss->tx.send_stop = (volatile uint32_t *)
3588		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3589#ifndef IFNET_BUF_RING
3590	}
3591#endif
3592	cmd.data0 = slice;
3593	err |= mxge_send_cmd(sc,
3594			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3595	ss->rx_small.lanai =
3596		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3597	cmd.data0 = slice;
3598	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3599	ss->rx_big.lanai =
3600		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3601
3602	if (err != 0) {
3603		device_printf(sc->dev,
3604			      "failed to get ring sizes or locations\n");
3605		return EIO;
3606	}
3607
3608	/* stock receive rings */
3609	for (i = 0; i <= ss->rx_small.mask; i++) {
3610		map = ss->rx_small.info[i].map;
3611		err = mxge_get_buf_small(ss, map, i);
3612		if (err) {
3613			device_printf(sc->dev, "alloced %d/%d smalls\n",
3614				      i, ss->rx_small.mask + 1);
3615			return ENOMEM;
3616		}
3617	}
3618	for (i = 0; i <= ss->rx_big.mask; i++) {
3619		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3620		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3621	}
3622	ss->rx_big.nbufs = nbufs;
3623	ss->rx_big.cl_size = cl_size;
3624	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3625		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3626	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3627		map = ss->rx_big.info[i].map;
3628		err = mxge_get_buf_big(ss, map, i);
3629		if (err) {
3630			device_printf(sc->dev, "alloced %d/%d bigs\n",
3631				      i, ss->rx_big.mask + 1);
3632			return ENOMEM;
3633		}
3634	}
3635	return 0;
3636}
3637
3638static int
3639mxge_open(mxge_softc_t *sc)
3640{
3641	mxge_cmd_t cmd;
3642	int err, big_bytes, nbufs, slice, cl_size, i;
3643	bus_addr_t bus;
3644	volatile uint8_t *itable;
3645	struct mxge_slice_state *ss;
3646
3647	/* Copy the MAC address in case it was overridden */
3648	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3649
3650	err = mxge_reset(sc, 1);
3651	if (err != 0) {
3652		device_printf(sc->dev, "failed to reset\n");
3653		return EIO;
3654	}
3655
3656	if (sc->num_slices > 1) {
3657		/* setup the indirection table */
3658		cmd.data0 = sc->num_slices;
3659		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3660				    &cmd);
3661
3662		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3663				     &cmd);
3664		if (err != 0) {
3665			device_printf(sc->dev,
3666				      "failed to setup rss tables\n");
3667			return err;
3668		}
3669
3670		/* just enable an identity mapping */
3671		itable = sc->sram + cmd.data0;
3672		for (i = 0; i < sc->num_slices; i++)
3673			itable[i] = (uint8_t)i;
3674
3675		cmd.data0 = 1;
3676		cmd.data1 = mxge_rss_hash_type;
3677		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3678		if (err != 0) {
3679			device_printf(sc->dev, "failed to enable slices\n");
3680			return err;
3681		}
3682	}
3683
3684
3685	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3686
3687	cmd.data0 = nbufs;
3688	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3689			    &cmd);
3690	/* error is only meaningful if we're trying to set
3691	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3692	if (err && nbufs > 1) {
3693		device_printf(sc->dev,
3694			      "Failed to set alway-use-n to %d\n",
3695			      nbufs);
3696		return EIO;
3697	}
3698	/* Give the firmware the mtu and the big and small buffer
3699	   sizes.  The firmware wants the big buf size to be a power
3700	   of two. Luckily, FreeBSD's clusters are powers of two */
3701	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3702	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3703	cmd.data0 = MHLEN - MXGEFW_PAD;
3704	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3705			     &cmd);
3706	cmd.data0 = big_bytes;
3707	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3708
3709	if (err != 0) {
3710		device_printf(sc->dev, "failed to setup params\n");
3711		goto abort;
3712	}
3713
3714	/* Now give him the pointer to the stats block */
3715	for (slice = 0;
3716#ifdef IFNET_BUF_RING
3717	     slice < sc->num_slices;
3718#else
3719	     slice < 1;
3720#endif
3721	     slice++) {
3722		ss = &sc->ss[slice];
3723		cmd.data0 =
3724			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3725		cmd.data1 =
3726			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3727		cmd.data2 = sizeof(struct mcp_irq_data);
3728		cmd.data2 |= (slice << 16);
3729		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3730	}
3731
3732	if (err != 0) {
3733		bus = sc->ss->fw_stats_dma.bus_addr;
3734		bus += offsetof(struct mcp_irq_data, send_done_count);
3735		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3736		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3737		err = mxge_send_cmd(sc,
3738				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3739				    &cmd);
3740		/* Firmware cannot support multicast without STATS_DMA_V2 */
3741		sc->fw_multicast_support = 0;
3742	} else {
3743		sc->fw_multicast_support = 1;
3744	}
3745
3746	if (err != 0) {
3747		device_printf(sc->dev, "failed to setup params\n");
3748		goto abort;
3749	}
3750
3751	for (slice = 0; slice < sc->num_slices; slice++) {
3752		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3753		if (err != 0) {
3754			device_printf(sc->dev, "couldn't open slice %d\n",
3755				      slice);
3756			goto abort;
3757		}
3758	}
3759
3760	/* Finally, start the firmware running */
3761	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3762	if (err) {
3763		device_printf(sc->dev, "Couldn't bring up link\n");
3764		goto abort;
3765	}
3766#ifdef IFNET_BUF_RING
3767	for (slice = 0; slice < sc->num_slices; slice++) {
3768		ss = &sc->ss[slice];
3769		ss->if_drv_flags |= IFF_DRV_RUNNING;
3770		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3771	}
3772#endif
3773	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3774	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3775
3776	return 0;
3777
3778
3779abort:
3780	mxge_free_mbufs(sc);
3781
3782	return err;
3783}
3784
3785static int
3786mxge_close(mxge_softc_t *sc, int down)
3787{
3788	mxge_cmd_t cmd;
3789	int err, old_down_cnt;
3790#ifdef IFNET_BUF_RING
3791	struct mxge_slice_state *ss;
3792	int slice;
3793#endif
3794
3795#ifdef IFNET_BUF_RING
3796	for (slice = 0; slice < sc->num_slices; slice++) {
3797		ss = &sc->ss[slice];
3798		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3799	}
3800#endif
3801	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3802	if (!down) {
3803		old_down_cnt = sc->down_cnt;
3804		wmb();
3805		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3806		if (err) {
3807			device_printf(sc->dev,
3808				      "Couldn't bring down link\n");
3809		}
3810		if (old_down_cnt == sc->down_cnt) {
3811			/* wait for down irq */
3812			DELAY(10 * sc->intr_coal_delay);
3813		}
3814		wmb();
3815		if (old_down_cnt == sc->down_cnt) {
3816			device_printf(sc->dev, "never got down irq\n");
3817		}
3818	}
3819	mxge_free_mbufs(sc);
3820
3821	return 0;
3822}
3823
3824static void
3825mxge_setup_cfg_space(mxge_softc_t *sc)
3826{
3827	device_t dev = sc->dev;
3828	int reg;
3829	uint16_t lnk, pectl;
3830
3831	/* find the PCIe link width and set max read request to 4KB*/
3832	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3833		lnk = pci_read_config(dev, reg + 0x12, 2);
3834		sc->link_width = (lnk >> 4) & 0x3f;
3835
3836		if (sc->pectl == 0) {
3837			pectl = pci_read_config(dev, reg + 0x8, 2);
3838			pectl = (pectl & ~0x7000) | (5 << 12);
3839			pci_write_config(dev, reg + 0x8, pectl, 2);
3840			sc->pectl = pectl;
3841		} else {
3842			/* restore saved pectl after watchdog reset */
3843			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3844		}
3845	}
3846
3847	/* Enable DMA and Memory space access */
3848	pci_enable_busmaster(dev);
3849}
3850
3851static uint32_t
3852mxge_read_reboot(mxge_softc_t *sc)
3853{
3854	device_t dev = sc->dev;
3855	uint32_t vs;
3856
3857	/* find the vendor specific offset */
3858	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3859		device_printf(sc->dev,
3860			      "could not find vendor specific offset\n");
3861		return (uint32_t)-1;
3862	}
3863	/* enable read32 mode */
3864	pci_write_config(dev, vs + 0x10, 0x3, 1);
3865	/* tell NIC which register to read */
3866	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3867	return (pci_read_config(dev, vs + 0x14, 4));
3868}
3869
3870static void
3871mxge_watchdog_reset(mxge_softc_t *sc)
3872{
3873	struct pci_devinfo *dinfo;
3874	struct mxge_slice_state *ss;
3875	int err, running, s, num_tx_slices = 1;
3876	uint32_t reboot;
3877	uint16_t cmd;
3878
3879	err = ENXIO;
3880
3881	device_printf(sc->dev, "Watchdog reset!\n");
3882
3883	/*
3884	 * check to see if the NIC rebooted.  If it did, then all of
3885	 * PCI config space has been reset, and things like the
3886	 * busmaster bit will be zero.  If this is the case, then we
3887	 * must restore PCI config space before the NIC can be used
3888	 * again
3889	 */
3890	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3891	if (cmd == 0xffff) {
3892		/*
3893		 * maybe the watchdog caught the NIC rebooting; wait
3894		 * up to 100ms for it to finish.  If it does not come
3895		 * back, then give up
3896		 */
3897		DELAY(1000*100);
3898		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3899		if (cmd == 0xffff) {
3900			device_printf(sc->dev, "NIC disappeared!\n");
3901		}
3902	}
3903	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3904		/* print the reboot status */
3905		reboot = mxge_read_reboot(sc);
3906		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3907			      reboot);
3908		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3909		if (running) {
3910
3911			/*
3912			 * quiesce NIC so that TX routines will not try to
3913			 * xmit after restoration of BAR
3914			 */
3915
3916			/* Mark the link as down */
3917			if (sc->link_state) {
3918				sc->link_state = 0;
3919				if_link_state_change(sc->ifp,
3920						     LINK_STATE_DOWN);
3921			}
3922#ifdef IFNET_BUF_RING
3923			num_tx_slices = sc->num_slices;
3924#endif
3925			/* grab all TX locks to ensure no tx  */
3926			for (s = 0; s < num_tx_slices; s++) {
3927				ss = &sc->ss[s];
3928				mtx_lock(&ss->tx.mtx);
3929			}
3930			mxge_close(sc, 1);
3931		}
3932		/* restore PCI configuration space */
3933		dinfo = device_get_ivars(sc->dev);
3934		pci_cfg_restore(sc->dev, dinfo);
3935
3936		/* and redo any changes we made to our config space */
3937		mxge_setup_cfg_space(sc);
3938
3939		/* reload f/w */
3940		err = mxge_load_firmware(sc, 0);
3941		if (err) {
3942			device_printf(sc->dev,
3943				      "Unable to re-load f/w\n");
3944		}
3945		if (running) {
3946			if (!err)
3947				err = mxge_open(sc);
3948			/* release all TX locks */
3949			for (s = 0; s < num_tx_slices; s++) {
3950				ss = &sc->ss[s];
3951#ifdef IFNET_BUF_RING
3952				mxge_start_locked(ss);
3953#endif
3954				mtx_unlock(&ss->tx.mtx);
3955			}
3956		}
3957		sc->watchdog_resets++;
3958	} else {
3959		device_printf(sc->dev,
3960			      "NIC did not reboot, not resetting\n");
3961		err = 0;
3962	}
3963	if (err) {
3964		device_printf(sc->dev, "watchdog reset failed\n");
3965	} else {
3966		if (sc->dying == 2)
3967			sc->dying = 0;
3968		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3969	}
3970}
3971
3972static void
3973mxge_watchdog_task(void *arg, int pending)
3974{
3975	mxge_softc_t *sc = arg;
3976
3977
3978	mtx_lock(&sc->driver_mtx);
3979	mxge_watchdog_reset(sc);
3980	mtx_unlock(&sc->driver_mtx);
3981}
3982
3983static void
3984mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3985{
3986	tx = &sc->ss[slice].tx;
3987	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3988	device_printf(sc->dev,
3989		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3990		      tx->req, tx->done, tx->queue_active);
3991	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3992			      tx->activate, tx->deactivate);
3993	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3994		      tx->pkt_done,
3995		      be32toh(sc->ss->fw_stats->send_done_count));
3996}
3997
3998static int
3999mxge_watchdog(mxge_softc_t *sc)
4000{
4001	mxge_tx_ring_t *tx;
4002	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
4003	int i, err = 0;
4004
4005	/* see if we have outstanding transmits, which
4006	   have been pending for more than mxge_ticks */
4007	for (i = 0;
4008#ifdef IFNET_BUF_RING
4009	     (i < sc->num_slices) && (err == 0);
4010#else
4011	     (i < 1) && (err == 0);
4012#endif
4013	     i++) {
4014		tx = &sc->ss[i].tx;
4015		if (tx->req != tx->done &&
4016		    tx->watchdog_req != tx->watchdog_done &&
4017		    tx->done == tx->watchdog_done) {
4018			/* check for pause blocking before resetting */
4019			if (tx->watchdog_rx_pause == rx_pause) {
4020				mxge_warn_stuck(sc, tx, i);
4021				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4022				return (ENXIO);
4023			}
4024			else
4025				device_printf(sc->dev, "Flow control blocking "
4026					      "xmits, check link partner\n");
4027		}
4028
4029		tx->watchdog_req = tx->req;
4030		tx->watchdog_done = tx->done;
4031		tx->watchdog_rx_pause = rx_pause;
4032	}
4033
4034	if (sc->need_media_probe)
4035		mxge_media_probe(sc);
4036	return (err);
4037}
4038
4039static u_long
4040mxge_update_stats(mxge_softc_t *sc)
4041{
4042	struct mxge_slice_state *ss;
4043	u_long pkts = 0;
4044	u_long ipackets = 0;
4045	u_long opackets = 0;
4046#ifdef IFNET_BUF_RING
4047	u_long obytes = 0;
4048	u_long omcasts = 0;
4049	u_long odrops = 0;
4050#endif
4051	u_long oerrors = 0;
4052	int slice;
4053
4054	for (slice = 0; slice < sc->num_slices; slice++) {
4055		ss = &sc->ss[slice];
4056		ipackets += ss->ipackets;
4057		opackets += ss->opackets;
4058#ifdef IFNET_BUF_RING
4059		obytes += ss->obytes;
4060		omcasts += ss->omcasts;
4061		odrops += ss->tx.br->br_drops;
4062#endif
4063		oerrors += ss->oerrors;
4064	}
4065	pkts = (ipackets - sc->ifp->if_ipackets);
4066	pkts += (opackets - sc->ifp->if_opackets);
4067	sc->ifp->if_ipackets = ipackets;
4068	sc->ifp->if_opackets = opackets;
4069#ifdef IFNET_BUF_RING
4070	sc->ifp->if_obytes = obytes;
4071	sc->ifp->if_omcasts = omcasts;
4072	sc->ifp->if_snd.ifq_drops = odrops;
4073#endif
4074	sc->ifp->if_oerrors = oerrors;
4075	return pkts;
4076}
4077
4078static void
4079mxge_tick(void *arg)
4080{
4081	mxge_softc_t *sc = arg;
4082	u_long pkts = 0;
4083	int err = 0;
4084	int running, ticks;
4085	uint16_t cmd;
4086
4087	ticks = mxge_ticks;
4088	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4089	if (running) {
4090		/* aggregate stats from different slices */
4091		pkts = mxge_update_stats(sc);
4092		if (!sc->watchdog_countdown) {
4093			err = mxge_watchdog(sc);
4094			sc->watchdog_countdown = 4;
4095		}
4096		sc->watchdog_countdown--;
4097	}
4098	if (pkts == 0) {
4099		/* ensure NIC did not suffer h/w fault while idle */
4100		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4101		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4102			sc->dying = 2;
4103			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4104			err = ENXIO;
4105		}
4106		/* look less often if NIC is idle */
4107		ticks *= 4;
4108	}
4109
4110	if (err == 0)
4111		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4112
4113}
4114
4115static int
4116mxge_media_change(struct ifnet *ifp)
4117{
4118	return EINVAL;
4119}
4120
4121static int
4122mxge_change_mtu(mxge_softc_t *sc, int mtu)
4123{
4124	struct ifnet *ifp = sc->ifp;
4125	int real_mtu, old_mtu;
4126	int err = 0;
4127
4128
4129	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4130	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4131		return EINVAL;
4132	mtx_lock(&sc->driver_mtx);
4133	old_mtu = ifp->if_mtu;
4134	ifp->if_mtu = mtu;
4135	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4136		mxge_close(sc, 0);
4137		err = mxge_open(sc);
4138		if (err != 0) {
4139			ifp->if_mtu = old_mtu;
4140			mxge_close(sc, 0);
4141			(void) mxge_open(sc);
4142		}
4143	}
4144	mtx_unlock(&sc->driver_mtx);
4145	return err;
4146}
4147
4148static void
4149mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4150{
4151	mxge_softc_t *sc = ifp->if_softc;
4152
4153
4154	if (sc == NULL)
4155		return;
4156	ifmr->ifm_status = IFM_AVALID;
4157	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4158	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4159	ifmr->ifm_active |= sc->current_media;
4160}
4161
4162static int
4163mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4164{
4165	mxge_softc_t *sc = ifp->if_softc;
4166	struct ifreq *ifr = (struct ifreq *)data;
4167	int err, mask;
4168
4169	err = 0;
4170	switch (command) {
4171	case SIOCSIFADDR:
4172	case SIOCGIFADDR:
4173		err = ether_ioctl(ifp, command, data);
4174		break;
4175
4176	case SIOCSIFMTU:
4177		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4178		break;
4179
4180	case SIOCSIFFLAGS:
4181		mtx_lock(&sc->driver_mtx);
4182		if (sc->dying) {
4183			mtx_unlock(&sc->driver_mtx);
4184			return EINVAL;
4185		}
4186		if (ifp->if_flags & IFF_UP) {
4187			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4188				err = mxge_open(sc);
4189			} else {
4190				/* take care of promis can allmulti
4191				   flag chages */
4192				mxge_change_promisc(sc,
4193						    ifp->if_flags & IFF_PROMISC);
4194				mxge_set_multicast_list(sc);
4195			}
4196		} else {
4197			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4198				mxge_close(sc, 0);
4199			}
4200		}
4201		mtx_unlock(&sc->driver_mtx);
4202		break;
4203
4204	case SIOCADDMULTI:
4205	case SIOCDELMULTI:
4206		mtx_lock(&sc->driver_mtx);
4207		mxge_set_multicast_list(sc);
4208		mtx_unlock(&sc->driver_mtx);
4209		break;
4210
4211	case SIOCSIFCAP:
4212		mtx_lock(&sc->driver_mtx);
4213		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4214		if (mask & IFCAP_TXCSUM) {
4215			if (IFCAP_TXCSUM & ifp->if_capenable) {
4216				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4217				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4218			} else {
4219				ifp->if_capenable |= IFCAP_TXCSUM;
4220				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4221			}
4222		} else if (mask & IFCAP_RXCSUM) {
4223			if (IFCAP_RXCSUM & ifp->if_capenable) {
4224				ifp->if_capenable &= ~IFCAP_RXCSUM;
4225			} else {
4226				ifp->if_capenable |= IFCAP_RXCSUM;
4227			}
4228		}
4229		if (mask & IFCAP_TSO4) {
4230			if (IFCAP_TSO4 & ifp->if_capenable) {
4231				ifp->if_capenable &= ~IFCAP_TSO4;
4232			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4233				ifp->if_capenable |= IFCAP_TSO4;
4234				ifp->if_hwassist |= CSUM_TSO;
4235			} else {
4236				printf("mxge requires tx checksum offload"
4237				       " be enabled to use TSO\n");
4238				err = EINVAL;
4239			}
4240		}
4241#if IFCAP_TSO6
4242		if (mask & IFCAP_TXCSUM_IPV6) {
4243			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4244				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4245						       | IFCAP_TSO6);
4246				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4247						      | CSUM_UDP);
4248			} else {
4249				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4250				ifp->if_hwassist |= (CSUM_TCP_IPV6
4251						     | CSUM_UDP_IPV6);
4252			}
4253		} else if (mask & IFCAP_RXCSUM_IPV6) {
4254			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4255				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4256			} else {
4257				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4258			}
4259		}
4260		if (mask & IFCAP_TSO6) {
4261			if (IFCAP_TSO6 & ifp->if_capenable) {
4262				ifp->if_capenable &= ~IFCAP_TSO6;
4263			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4264				ifp->if_capenable |= IFCAP_TSO6;
4265				ifp->if_hwassist |= CSUM_TSO;
4266			} else {
4267				printf("mxge requires tx checksum offload"
4268				       " be enabled to use TSO\n");
4269				err = EINVAL;
4270			}
4271		}
4272#endif /*IFCAP_TSO6 */
4273
4274		if (mask & IFCAP_LRO)
4275			ifp->if_capenable ^= IFCAP_LRO;
4276		if (mask & IFCAP_VLAN_HWTAGGING)
4277			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4278		if (mask & IFCAP_VLAN_HWTSO)
4279			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4280
4281		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4282		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4283			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4284
4285		mtx_unlock(&sc->driver_mtx);
4286		VLAN_CAPABILITIES(ifp);
4287
4288		break;
4289
4290	case SIOCGIFMEDIA:
4291		mtx_lock(&sc->driver_mtx);
4292		mxge_media_probe(sc);
4293		mtx_unlock(&sc->driver_mtx);
4294		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4295				    &sc->media, command);
4296                break;
4297
4298	default:
4299		err = ENOTTY;
4300        }
4301	return err;
4302}
4303
4304static void
4305mxge_fetch_tunables(mxge_softc_t *sc)
4306{
4307
4308	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4309	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4310			  &mxge_flow_control);
4311	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4312			  &mxge_intr_coal_delay);
4313	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4314			  &mxge_nvidia_ecrc_enable);
4315	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4316			  &mxge_force_firmware);
4317	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4318			  &mxge_deassert_wait);
4319	TUNABLE_INT_FETCH("hw.mxge.verbose",
4320			  &mxge_verbose);
4321	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4322	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4323	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4324	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4325	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4326	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4327
4328	if (bootverbose)
4329		mxge_verbose = 1;
4330	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4331		mxge_intr_coal_delay = 30;
4332	if (mxge_ticks == 0)
4333		mxge_ticks = hz / 2;
4334	sc->pause = mxge_flow_control;
4335	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4336	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4337		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4338	}
4339	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4340	    mxge_initial_mtu < ETHER_MIN_LEN)
4341		mxge_initial_mtu = ETHERMTU_JUMBO;
4342
4343	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4344		mxge_throttle = MXGE_MAX_THROTTLE;
4345	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4346		mxge_throttle = MXGE_MIN_THROTTLE;
4347	sc->throttle = mxge_throttle;
4348}
4349
4350
4351static void
4352mxge_free_slices(mxge_softc_t *sc)
4353{
4354	struct mxge_slice_state *ss;
4355	int i;
4356
4357
4358	if (sc->ss == NULL)
4359		return;
4360
4361	for (i = 0; i < sc->num_slices; i++) {
4362		ss = &sc->ss[i];
4363		if (ss->fw_stats != NULL) {
4364			mxge_dma_free(&ss->fw_stats_dma);
4365			ss->fw_stats = NULL;
4366#ifdef IFNET_BUF_RING
4367			if (ss->tx.br != NULL) {
4368				drbr_free(ss->tx.br, M_DEVBUF);
4369				ss->tx.br = NULL;
4370			}
4371#endif
4372			mtx_destroy(&ss->tx.mtx);
4373		}
4374		if (ss->rx_done.entry != NULL) {
4375			mxge_dma_free(&ss->rx_done.dma);
4376			ss->rx_done.entry = NULL;
4377		}
4378	}
4379	free(sc->ss, M_DEVBUF);
4380	sc->ss = NULL;
4381}
4382
4383static int
4384mxge_alloc_slices(mxge_softc_t *sc)
4385{
4386	mxge_cmd_t cmd;
4387	struct mxge_slice_state *ss;
4388	size_t bytes;
4389	int err, i, max_intr_slots;
4390
4391	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4392	if (err != 0) {
4393		device_printf(sc->dev, "Cannot determine rx ring size\n");
4394		return err;
4395	}
4396	sc->rx_ring_size = cmd.data0;
4397	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4398
4399	bytes = sizeof (*sc->ss) * sc->num_slices;
4400	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4401	if (sc->ss == NULL)
4402		return (ENOMEM);
4403	for (i = 0; i < sc->num_slices; i++) {
4404		ss = &sc->ss[i];
4405
4406		ss->sc = sc;
4407
4408		/* allocate per-slice rx interrupt queues */
4409
4410		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4411		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4412		if (err != 0)
4413			goto abort;
4414		ss->rx_done.entry = ss->rx_done.dma.addr;
4415		bzero(ss->rx_done.entry, bytes);
4416
4417		/*
4418		 * allocate the per-slice firmware stats; stats
4419		 * (including tx) are used used only on the first
4420		 * slice for now
4421		 */
4422#ifndef IFNET_BUF_RING
4423		if (i > 0)
4424			continue;
4425#endif
4426
4427		bytes = sizeof (*ss->fw_stats);
4428		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4429				     sizeof (*ss->fw_stats), 64);
4430		if (err != 0)
4431			goto abort;
4432		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4433		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4434			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4435		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4436#ifdef IFNET_BUF_RING
4437		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4438					   &ss->tx.mtx);
4439#endif
4440	}
4441
4442	return (0);
4443
4444abort:
4445	mxge_free_slices(sc);
4446	return (ENOMEM);
4447}
4448
4449static void
4450mxge_slice_probe(mxge_softc_t *sc)
4451{
4452	mxge_cmd_t cmd;
4453	char *old_fw;
4454	int msix_cnt, status, max_intr_slots;
4455
4456	sc->num_slices = 1;
4457	/*
4458	 *  don't enable multiple slices if they are not enabled,
4459	 *  or if this is not an SMP system
4460	 */
4461
4462	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4463		return;
4464
4465	/* see how many MSI-X interrupts are available */
4466	msix_cnt = pci_msix_count(sc->dev);
4467	if (msix_cnt < 2)
4468		return;
4469
4470	/* now load the slice aware firmware see what it supports */
4471	old_fw = sc->fw_name;
4472	if (old_fw == mxge_fw_aligned)
4473		sc->fw_name = mxge_fw_rss_aligned;
4474	else
4475		sc->fw_name = mxge_fw_rss_unaligned;
4476	status = mxge_load_firmware(sc, 0);
4477	if (status != 0) {
4478		device_printf(sc->dev, "Falling back to a single slice\n");
4479		return;
4480	}
4481
4482	/* try to send a reset command to the card to see if it
4483	   is alive */
4484	memset(&cmd, 0, sizeof (cmd));
4485	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4486	if (status != 0) {
4487		device_printf(sc->dev, "failed reset\n");
4488		goto abort_with_fw;
4489	}
4490
4491	/* get rx ring size */
4492	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4493	if (status != 0) {
4494		device_printf(sc->dev, "Cannot determine rx ring size\n");
4495		goto abort_with_fw;
4496	}
4497	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4498
4499	/* tell it the size of the interrupt queues */
4500	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4501	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4502	if (status != 0) {
4503		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4504		goto abort_with_fw;
4505	}
4506
4507	/* ask the maximum number of slices it supports */
4508	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4509	if (status != 0) {
4510		device_printf(sc->dev,
4511			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4512		goto abort_with_fw;
4513	}
4514	sc->num_slices = cmd.data0;
4515	if (sc->num_slices > msix_cnt)
4516		sc->num_slices = msix_cnt;
4517
4518	if (mxge_max_slices == -1) {
4519		/* cap to number of CPUs in system */
4520		if (sc->num_slices > mp_ncpus)
4521			sc->num_slices = mp_ncpus;
4522	} else {
4523		if (sc->num_slices > mxge_max_slices)
4524			sc->num_slices = mxge_max_slices;
4525	}
4526	/* make sure it is a power of two */
4527	while (sc->num_slices & (sc->num_slices - 1))
4528		sc->num_slices--;
4529
4530	if (mxge_verbose)
4531		device_printf(sc->dev, "using %d slices\n",
4532			      sc->num_slices);
4533
4534	return;
4535
4536abort_with_fw:
4537	sc->fw_name = old_fw;
4538	(void) mxge_load_firmware(sc, 0);
4539}
4540
4541static int
4542mxge_add_msix_irqs(mxge_softc_t *sc)
4543{
4544	size_t bytes;
4545	int count, err, i, rid;
4546
4547	rid = PCIR_BAR(2);
4548	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4549						    &rid, RF_ACTIVE);
4550
4551	if (sc->msix_table_res == NULL) {
4552		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4553		return ENXIO;
4554	}
4555
4556	count = sc->num_slices;
4557	err = pci_alloc_msix(sc->dev, &count);
4558	if (err != 0) {
4559		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4560			      "err = %d \n", sc->num_slices, err);
4561		goto abort_with_msix_table;
4562	}
4563	if (count < sc->num_slices) {
4564		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4565			      count, sc->num_slices);
4566		device_printf(sc->dev,
4567			      "Try setting hw.mxge.max_slices to %d\n",
4568			      count);
4569		err = ENOSPC;
4570		goto abort_with_msix;
4571	}
4572	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4573	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4574	if (sc->msix_irq_res == NULL) {
4575		err = ENOMEM;
4576		goto abort_with_msix;
4577	}
4578
4579	for (i = 0; i < sc->num_slices; i++) {
4580		rid = i + 1;
4581		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4582							  SYS_RES_IRQ,
4583							  &rid, RF_ACTIVE);
4584		if (sc->msix_irq_res[i] == NULL) {
4585			device_printf(sc->dev, "couldn't allocate IRQ res"
4586				      " for message %d\n", i);
4587			err = ENXIO;
4588			goto abort_with_res;
4589		}
4590	}
4591
4592	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4593	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4594
4595	for (i = 0; i < sc->num_slices; i++) {
4596		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4597				     INTR_TYPE_NET | INTR_MPSAFE,
4598#if __FreeBSD_version > 700030
4599				     NULL,
4600#endif
4601				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4602		if (err != 0) {
4603			device_printf(sc->dev, "couldn't setup intr for "
4604				      "message %d\n", i);
4605			goto abort_with_intr;
4606		}
4607		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4608				  sc->msix_ih[i], "s%d", i);
4609	}
4610
4611	if (mxge_verbose) {
4612		device_printf(sc->dev, "using %d msix IRQs:",
4613			      sc->num_slices);
4614		for (i = 0; i < sc->num_slices; i++)
4615			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4616		printf("\n");
4617	}
4618	return (0);
4619
4620abort_with_intr:
4621	for (i = 0; i < sc->num_slices; i++) {
4622		if (sc->msix_ih[i] != NULL) {
4623			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4624					  sc->msix_ih[i]);
4625			sc->msix_ih[i] = NULL;
4626		}
4627	}
4628	free(sc->msix_ih, M_DEVBUF);
4629
4630
4631abort_with_res:
4632	for (i = 0; i < sc->num_slices; i++) {
4633		rid = i + 1;
4634		if (sc->msix_irq_res[i] != NULL)
4635			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4636					     sc->msix_irq_res[i]);
4637		sc->msix_irq_res[i] = NULL;
4638	}
4639	free(sc->msix_irq_res, M_DEVBUF);
4640
4641
4642abort_with_msix:
4643	pci_release_msi(sc->dev);
4644
4645abort_with_msix_table:
4646	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4647			     sc->msix_table_res);
4648
4649	return err;
4650}
4651
4652static int
4653mxge_add_single_irq(mxge_softc_t *sc)
4654{
4655	int count, err, rid;
4656
4657	count = pci_msi_count(sc->dev);
4658	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4659		rid = 1;
4660	} else {
4661		rid = 0;
4662		sc->legacy_irq = 1;
4663	}
4664	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4665					 1, RF_SHAREABLE | RF_ACTIVE);
4666	if (sc->irq_res == NULL) {
4667		device_printf(sc->dev, "could not alloc interrupt\n");
4668		return ENXIO;
4669	}
4670	if (mxge_verbose)
4671		device_printf(sc->dev, "using %s irq %ld\n",
4672			      sc->legacy_irq ? "INTx" : "MSI",
4673			      rman_get_start(sc->irq_res));
4674	err = bus_setup_intr(sc->dev, sc->irq_res,
4675			     INTR_TYPE_NET | INTR_MPSAFE,
4676#if __FreeBSD_version > 700030
4677			     NULL,
4678#endif
4679			     mxge_intr, &sc->ss[0], &sc->ih);
4680	if (err != 0) {
4681		bus_release_resource(sc->dev, SYS_RES_IRQ,
4682				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4683		if (!sc->legacy_irq)
4684			pci_release_msi(sc->dev);
4685	}
4686	return err;
4687}
4688
4689static void
4690mxge_rem_msix_irqs(mxge_softc_t *sc)
4691{
4692	int i, rid;
4693
4694	for (i = 0; i < sc->num_slices; i++) {
4695		if (sc->msix_ih[i] != NULL) {
4696			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4697					  sc->msix_ih[i]);
4698			sc->msix_ih[i] = NULL;
4699		}
4700	}
4701	free(sc->msix_ih, M_DEVBUF);
4702
4703	for (i = 0; i < sc->num_slices; i++) {
4704		rid = i + 1;
4705		if (sc->msix_irq_res[i] != NULL)
4706			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4707					     sc->msix_irq_res[i]);
4708		sc->msix_irq_res[i] = NULL;
4709	}
4710	free(sc->msix_irq_res, M_DEVBUF);
4711
4712	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4713			     sc->msix_table_res);
4714
4715	pci_release_msi(sc->dev);
4716	return;
4717}
4718
4719static void
4720mxge_rem_single_irq(mxge_softc_t *sc)
4721{
4722	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4723	bus_release_resource(sc->dev, SYS_RES_IRQ,
4724			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4725	if (!sc->legacy_irq)
4726		pci_release_msi(sc->dev);
4727}
4728
4729static void
4730mxge_rem_irq(mxge_softc_t *sc)
4731{
4732	if (sc->num_slices > 1)
4733		mxge_rem_msix_irqs(sc);
4734	else
4735		mxge_rem_single_irq(sc);
4736}
4737
4738static int
4739mxge_add_irq(mxge_softc_t *sc)
4740{
4741	int err;
4742
4743	if (sc->num_slices > 1)
4744		err = mxge_add_msix_irqs(sc);
4745	else
4746		err = mxge_add_single_irq(sc);
4747
4748	if (0 && err == 0 && sc->num_slices > 1) {
4749		mxge_rem_msix_irqs(sc);
4750		err = mxge_add_msix_irqs(sc);
4751	}
4752	return err;
4753}
4754
4755
4756static int
4757mxge_attach(device_t dev)
4758{
4759	mxge_cmd_t cmd;
4760	mxge_softc_t *sc = device_get_softc(dev);
4761	struct ifnet *ifp;
4762	int err, rid;
4763
4764	sc->dev = dev;
4765	mxge_fetch_tunables(sc);
4766
4767	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4768	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4769				  taskqueue_thread_enqueue, &sc->tq);
4770	if (sc->tq == NULL) {
4771		err = ENOMEM;
4772		goto abort_with_nothing;
4773	}
4774
4775	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4776				 1,			/* alignment */
4777				 0,			/* boundary */
4778				 BUS_SPACE_MAXADDR,	/* low */
4779				 BUS_SPACE_MAXADDR,	/* high */
4780				 NULL, NULL,		/* filter */
4781				 65536 + 256,		/* maxsize */
4782				 MXGE_MAX_SEND_DESC, 	/* num segs */
4783				 65536,			/* maxsegsize */
4784				 0,			/* flags */
4785				 NULL, NULL,		/* lock */
4786				 &sc->parent_dmat);	/* tag */
4787
4788	if (err != 0) {
4789		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4790			      err);
4791		goto abort_with_tq;
4792	}
4793
4794	ifp = sc->ifp = if_alloc(IFT_ETHER);
4795	if (ifp == NULL) {
4796		device_printf(dev, "can not if_alloc()\n");
4797		err = ENOSPC;
4798		goto abort_with_parent_dmat;
4799	}
4800	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4801
4802	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4803		 device_get_nameunit(dev));
4804	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4805	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4806		 "%s:drv", device_get_nameunit(dev));
4807	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4808		 MTX_NETWORK_LOCK, MTX_DEF);
4809
4810	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4811
4812	mxge_setup_cfg_space(sc);
4813
4814	/* Map the board into the kernel */
4815	rid = PCIR_BARS;
4816	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4817					 ~0, 1, RF_ACTIVE);
4818	if (sc->mem_res == NULL) {
4819		device_printf(dev, "could not map memory\n");
4820		err = ENXIO;
4821		goto abort_with_lock;
4822	}
4823	sc->sram = rman_get_virtual(sc->mem_res);
4824	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4825	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4826		device_printf(dev, "impossible memory region size %ld\n",
4827			      rman_get_size(sc->mem_res));
4828		err = ENXIO;
4829		goto abort_with_mem_res;
4830	}
4831
4832	/* make NULL terminated copy of the EEPROM strings section of
4833	   lanai SRAM */
4834	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4835	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4836				rman_get_bushandle(sc->mem_res),
4837				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4838				sc->eeprom_strings,
4839				MXGE_EEPROM_STRINGS_SIZE - 2);
4840	err = mxge_parse_strings(sc);
4841	if (err != 0)
4842		goto abort_with_mem_res;
4843
4844	/* Enable write combining for efficient use of PCIe bus */
4845	mxge_enable_wc(sc);
4846
4847	/* Allocate the out of band dma memory */
4848	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4849			     sizeof (mxge_cmd_t), 64);
4850	if (err != 0)
4851		goto abort_with_mem_res;
4852	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4853	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4854	if (err != 0)
4855		goto abort_with_cmd_dma;
4856
4857	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4858	if (err != 0)
4859		goto abort_with_zeropad_dma;
4860
4861	/* select & load the firmware */
4862	err = mxge_select_firmware(sc);
4863	if (err != 0)
4864		goto abort_with_dmabench;
4865	sc->intr_coal_delay = mxge_intr_coal_delay;
4866
4867	mxge_slice_probe(sc);
4868	err = mxge_alloc_slices(sc);
4869	if (err != 0)
4870		goto abort_with_dmabench;
4871
4872	err = mxge_reset(sc, 0);
4873	if (err != 0)
4874		goto abort_with_slices;
4875
4876	err = mxge_alloc_rings(sc);
4877	if (err != 0) {
4878		device_printf(sc->dev, "failed to allocate rings\n");
4879		goto abort_with_slices;
4880	}
4881
4882	err = mxge_add_irq(sc);
4883	if (err != 0) {
4884		device_printf(sc->dev, "failed to add irq\n");
4885		goto abort_with_rings;
4886	}
4887
4888	ifp->if_baudrate = IF_Gbps(10UL);
4889	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4890		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4891		IFCAP_RXCSUM_IPV6;
4892#if defined(INET) || defined(INET6)
4893	ifp->if_capabilities |= IFCAP_LRO;
4894#endif
4895
4896#ifdef MXGE_NEW_VLAN_API
4897	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4898
4899	/* Only FW 1.4.32 and newer can do TSO over vlans */
4900	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4901	    sc->fw_ver_tiny >= 32)
4902		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4903#endif
4904	sc->max_mtu = mxge_max_mtu(sc);
4905	if (sc->max_mtu >= 9000)
4906		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4907	else
4908		device_printf(dev, "MTU limited to %d.  Install "
4909			      "latest firmware for 9000 byte jumbo support\n",
4910			      sc->max_mtu - ETHER_HDR_LEN);
4911	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4912	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4913	/* check to see if f/w supports TSO for IPv6 */
4914	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4915		if (CSUM_TCP_IPV6)
4916			ifp->if_capabilities |= IFCAP_TSO6;
4917		sc->max_tso6_hlen = min(cmd.data0,
4918					sizeof (sc->ss[0].scratch));
4919	}
4920	ifp->if_capenable = ifp->if_capabilities;
4921	if (sc->lro_cnt == 0)
4922		ifp->if_capenable &= ~IFCAP_LRO;
4923        ifp->if_init = mxge_init;
4924        ifp->if_softc = sc;
4925        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4926        ifp->if_ioctl = mxge_ioctl;
4927        ifp->if_start = mxge_start;
4928	/* Initialise the ifmedia structure */
4929	ifmedia_init(&sc->media, 0, mxge_media_change,
4930		     mxge_media_status);
4931	mxge_media_init(sc);
4932	mxge_media_probe(sc);
4933	sc->dying = 0;
4934	ether_ifattach(ifp, sc->mac_addr);
4935	/* ether_ifattach sets mtu to ETHERMTU */
4936	if (mxge_initial_mtu != ETHERMTU)
4937		mxge_change_mtu(sc, mxge_initial_mtu);
4938
4939	mxge_add_sysctls(sc);
4940#ifdef IFNET_BUF_RING
4941	ifp->if_transmit = mxge_transmit;
4942	ifp->if_qflush = mxge_qflush;
4943#endif
4944	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4945				device_get_nameunit(sc->dev));
4946	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4947	return 0;
4948
4949abort_with_rings:
4950	mxge_free_rings(sc);
4951abort_with_slices:
4952	mxge_free_slices(sc);
4953abort_with_dmabench:
4954	mxge_dma_free(&sc->dmabench_dma);
4955abort_with_zeropad_dma:
4956	mxge_dma_free(&sc->zeropad_dma);
4957abort_with_cmd_dma:
4958	mxge_dma_free(&sc->cmd_dma);
4959abort_with_mem_res:
4960	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4961abort_with_lock:
4962	pci_disable_busmaster(dev);
4963	mtx_destroy(&sc->cmd_mtx);
4964	mtx_destroy(&sc->driver_mtx);
4965	if_free(ifp);
4966abort_with_parent_dmat:
4967	bus_dma_tag_destroy(sc->parent_dmat);
4968abort_with_tq:
4969	if (sc->tq != NULL) {
4970		taskqueue_drain(sc->tq, &sc->watchdog_task);
4971		taskqueue_free(sc->tq);
4972		sc->tq = NULL;
4973	}
4974abort_with_nothing:
4975	return err;
4976}
4977
4978static int
4979mxge_detach(device_t dev)
4980{
4981	mxge_softc_t *sc = device_get_softc(dev);
4982
4983	if (mxge_vlans_active(sc)) {
4984		device_printf(sc->dev,
4985			      "Detach vlans before removing module\n");
4986		return EBUSY;
4987	}
4988	mtx_lock(&sc->driver_mtx);
4989	sc->dying = 1;
4990	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4991		mxge_close(sc, 0);
4992	mtx_unlock(&sc->driver_mtx);
4993	ether_ifdetach(sc->ifp);
4994	if (sc->tq != NULL) {
4995		taskqueue_drain(sc->tq, &sc->watchdog_task);
4996		taskqueue_free(sc->tq);
4997		sc->tq = NULL;
4998	}
4999	callout_drain(&sc->co_hdl);
5000	ifmedia_removeall(&sc->media);
5001	mxge_dummy_rdma(sc, 0);
5002	mxge_rem_sysctls(sc);
5003	mxge_rem_irq(sc);
5004	mxge_free_rings(sc);
5005	mxge_free_slices(sc);
5006	mxge_dma_free(&sc->dmabench_dma);
5007	mxge_dma_free(&sc->zeropad_dma);
5008	mxge_dma_free(&sc->cmd_dma);
5009	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5010	pci_disable_busmaster(dev);
5011	mtx_destroy(&sc->cmd_mtx);
5012	mtx_destroy(&sc->driver_mtx);
5013	if_free(sc->ifp);
5014	bus_dma_tag_destroy(sc->parent_dmat);
5015	return 0;
5016}
5017
5018static int
5019mxge_shutdown(device_t dev)
5020{
5021	return 0;
5022}
5023
5024/*
5025  This file uses Myri10GE driver indentation.
5026
5027  Local Variables:
5028  c-file-style:"linux"
5029  tab-width:8
5030  End:
5031*/
5032