1/******************************************************************************
2
3Copyright (c) 2006-2013, Myricom Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Myricom Inc, nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/10/sys/dev/mxge/if_mxge.c 329834 2018-02-22 19:40:03Z rpokala $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/linker.h>
36#include <sys/firmware.h>
37#include <sys/endian.h>
38#include <sys/sockio.h>
39#include <sys/mbuf.h>
40#include <sys/malloc.h>
41#include <sys/kdb.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/module.h>
45#include <sys/socket.h>
46#include <sys/sysctl.h>
47#include <sys/sx.h>
48#include <sys/taskqueue.h>
49
50#include <net/if.h>
51#include <net/if_arp.h>
52#include <net/ethernet.h>
53#include <net/if_dl.h>
54#include <net/if_media.h>
55
56#include <net/bpf.h>
57
58#include <net/if_types.h>
59#include <net/if_vlan_var.h>
60#include <net/zlib.h>
61
62#include <netinet/in_systm.h>
63#include <netinet/in.h>
64#include <netinet/ip.h>
65#include <netinet/ip6.h>
66#include <netinet/tcp.h>
67#include <netinet/tcp_lro.h>
68#include <netinet6/ip6_var.h>
69
70#include <machine/bus.h>
71#include <machine/in_cksum.h>
72#include <machine/resource.h>
73#include <sys/bus.h>
74#include <sys/rman.h>
75#include <sys/smp.h>
76
77#include <dev/pci/pcireg.h>
78#include <dev/pci/pcivar.h>
79#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
80
81#include <vm/vm.h>		/* for pmap_mapdev() */
82#include <vm/pmap.h>
83
84#if defined(__i386) || defined(__amd64)
85#include <machine/specialreg.h>
86#endif
87
88#include <dev/mxge/mxge_mcp.h>
89#include <dev/mxge/mcp_gen_header.h>
90/*#define MXGE_FAKE_IFP*/
91#include <dev/mxge/if_mxge_var.h>
92#ifdef IFNET_BUF_RING
93#include <sys/buf_ring.h>
94#endif
95
96#include "opt_inet.h"
97#include "opt_inet6.h"
98
99/* tunable params */
100static int mxge_nvidia_ecrc_enable = 1;
101static int mxge_force_firmware = 0;
102static int mxge_intr_coal_delay = 30;
103static int mxge_deassert_wait = 1;
104static int mxge_flow_control = 1;
105static int mxge_verbose = 0;
106static int mxge_ticks;
107static int mxge_max_slices = 1;
108static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
109static int mxge_always_promisc = 0;
110static int mxge_initial_mtu = ETHERMTU_JUMBO;
111static int mxge_throttle = 0;
112static char *mxge_fw_unaligned = "mxge_ethp_z8e";
113static char *mxge_fw_aligned = "mxge_eth_z8e";
114static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
115static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
116
117static int mxge_probe(device_t dev);
118static int mxge_attach(device_t dev);
119static int mxge_detach(device_t dev);
120static int mxge_shutdown(device_t dev);
121static void mxge_intr(void *arg);
122
123static device_method_t mxge_methods[] =
124{
125  /* Device interface */
126  DEVMETHOD(device_probe, mxge_probe),
127  DEVMETHOD(device_attach, mxge_attach),
128  DEVMETHOD(device_detach, mxge_detach),
129  DEVMETHOD(device_shutdown, mxge_shutdown),
130
131  DEVMETHOD_END
132};
133
134static driver_t mxge_driver =
135{
136  "mxge",
137  mxge_methods,
138  sizeof(mxge_softc_t),
139};
140
141static devclass_t mxge_devclass;
142
143/* Declare ourselves to be a child of the PCI bus.*/
144DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
145MODULE_DEPEND(mxge, firmware, 1, 1, 1);
146MODULE_DEPEND(mxge, zlib, 1, 1, 1);
147
148static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
149static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
150static int mxge_close(mxge_softc_t *sc, int down);
151static int mxge_open(mxge_softc_t *sc);
152static void mxge_tick(void *arg);
153
154static int
155mxge_probe(device_t dev)
156{
157	int rev;
158
159
160	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
161	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
162	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
163		rev = pci_get_revid(dev);
164		switch (rev) {
165		case MXGE_PCI_REV_Z8E:
166			device_set_desc(dev, "Myri10G-PCIE-8A");
167			break;
168		case MXGE_PCI_REV_Z8ES:
169			device_set_desc(dev, "Myri10G-PCIE-8B");
170			break;
171		default:
172			device_set_desc(dev, "Myri10G-PCIE-8??");
173			device_printf(dev, "Unrecognized rev %d NIC\n",
174				      rev);
175			break;
176		}
177		return 0;
178	}
179	return ENXIO;
180}
181
182static void
183mxge_enable_wc(mxge_softc_t *sc)
184{
185#if defined(__i386) || defined(__amd64)
186	vm_offset_t len;
187	int err;
188
189	sc->wc = 1;
190	len = rman_get_size(sc->mem_res);
191	err = pmap_change_attr((vm_offset_t) sc->sram,
192			       len, PAT_WRITE_COMBINING);
193	if (err != 0) {
194		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
195			      err);
196		sc->wc = 0;
197	}
198#endif
199}
200
201
202/* callback to get our DMA address */
203static void
204mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
205			 int error)
206{
207	if (error == 0) {
208		*(bus_addr_t *) arg = segs->ds_addr;
209	}
210}
211
212static int
213mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
214		   bus_size_t alignment)
215{
216	int err;
217	device_t dev = sc->dev;
218	bus_size_t boundary, maxsegsize;
219
220	if (bytes > 4096 && alignment == 4096) {
221		boundary = 0;
222		maxsegsize = bytes;
223	} else {
224		boundary = 4096;
225		maxsegsize = 4096;
226	}
227
228	/* allocate DMAable memory tags */
229	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
230				 alignment,		/* alignment */
231				 boundary,		/* boundary */
232				 BUS_SPACE_MAXADDR,	/* low */
233				 BUS_SPACE_MAXADDR,	/* high */
234				 NULL, NULL,		/* filter */
235				 bytes,			/* maxsize */
236				 1,			/* num segs */
237				 maxsegsize,		/* maxsegsize */
238				 BUS_DMA_COHERENT,	/* flags */
239				 NULL, NULL,		/* lock */
240				 &dma->dmat);		/* tag */
241	if (err != 0) {
242		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
243		return err;
244	}
245
246	/* allocate DMAable memory & map */
247	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
248			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
249				| BUS_DMA_ZERO),  &dma->map);
250	if (err != 0) {
251		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
252		goto abort_with_dmat;
253	}
254
255	/* load the memory */
256	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
257			      mxge_dmamap_callback,
258			      (void *)&dma->bus_addr, 0);
259	if (err != 0) {
260		device_printf(dev, "couldn't load map (err = %d)\n", err);
261		goto abort_with_mem;
262	}
263	return 0;
264
265abort_with_mem:
266	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
267abort_with_dmat:
268	(void)bus_dma_tag_destroy(dma->dmat);
269	return err;
270}
271
272
273static void
274mxge_dma_free(mxge_dma_t *dma)
275{
276	bus_dmamap_unload(dma->dmat, dma->map);
277	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
278	(void)bus_dma_tag_destroy(dma->dmat);
279}
280
281/*
282 * The eeprom strings on the lanaiX have the format
283 * SN=x\0
284 * MAC=x:x:x:x:x:x\0
285 * PC=text\0
286 */
287
288static int
289mxge_parse_strings(mxge_softc_t *sc)
290{
291	char *ptr;
292	int i, found_mac, found_sn2;
293	char *endptr;
294
295	ptr = sc->eeprom_strings;
296	found_mac = 0;
297	found_sn2 = 0;
298	while (*ptr != '\0') {
299		if (strncmp(ptr, "MAC=", 4) == 0) {
300			ptr += 4;
301			for (i = 0;;) {
302				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
303				if (endptr - ptr != 2)
304					goto abort;
305				ptr = endptr;
306				if (++i == 6)
307					break;
308				if (*ptr++ != ':')
309					goto abort;
310			}
311			found_mac = 1;
312		} else if (strncmp(ptr, "PC=", 3) == 0) {
313			ptr += 3;
314			strlcpy(sc->product_code_string, ptr,
315			    sizeof(sc->product_code_string));
316		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
317			ptr += 3;
318			strlcpy(sc->serial_number_string, ptr,
319			    sizeof(sc->serial_number_string));
320		} else if (strncmp(ptr, "SN2=", 4) == 0) {
321			/* SN2 takes precedence over SN */
322			ptr += 4;
323			found_sn2 = 1;
324			strlcpy(sc->serial_number_string, ptr,
325			    sizeof(sc->serial_number_string));
326		}
327		while (*ptr++ != '\0') {}
328	}
329
330	if (found_mac)
331		return 0;
332
333 abort:
334	device_printf(sc->dev, "failed to parse eeprom_strings\n");
335
336	return ENXIO;
337}
338
339#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
340static void
341mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
342{
343	uint32_t val;
344	unsigned long base, off;
345	char *va, *cfgptr;
346	device_t pdev, mcp55;
347	uint16_t vendor_id, device_id, word;
348	uintptr_t bus, slot, func, ivend, idev;
349	uint32_t *ptr32;
350
351
352	if (!mxge_nvidia_ecrc_enable)
353		return;
354
355	pdev = device_get_parent(device_get_parent(sc->dev));
356	if (pdev == NULL) {
357		device_printf(sc->dev, "could not find parent?\n");
358		return;
359	}
360	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
361	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
362
363	if (vendor_id != 0x10de)
364		return;
365
366	base = 0;
367
368	if (device_id == 0x005d) {
369		/* ck804, base address is magic */
370		base = 0xe0000000UL;
371	} else if (device_id >= 0x0374 && device_id <= 0x378) {
372		/* mcp55, base address stored in chipset */
373		mcp55 = pci_find_bsf(0, 0, 0);
374		if (mcp55 &&
375		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
376		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
377			word = pci_read_config(mcp55, 0x90, 2);
378			base = ((unsigned long)word & 0x7ffeU) << 25;
379		}
380	}
381	if (!base)
382		return;
383
384	/* XXXX
385	   Test below is commented because it is believed that doing
386	   config read/write beyond 0xff will access the config space
387	   for the next larger function.  Uncomment this and remove
388	   the hacky pmap_mapdev() way of accessing config space when
389	   FreeBSD grows support for extended pcie config space access
390	*/
391#if 0
392	/* See if we can, by some miracle, access the extended
393	   config space */
394	val = pci_read_config(pdev, 0x178, 4);
395	if (val != 0xffffffff) {
396		val |= 0x40;
397		pci_write_config(pdev, 0x178, val, 4);
398		return;
399	}
400#endif
401	/* Rather than using normal pci config space writes, we must
402	 * map the Nvidia config space ourselves.  This is because on
403	 * opteron/nvidia class machine the 0xe000000 mapping is
404	 * handled by the nvidia chipset, that means the internal PCI
405	 * device (the on-chip northbridge), or the amd-8131 bridge
406	 * and things behind them are not visible by this method.
407	 */
408
409	BUS_READ_IVAR(device_get_parent(pdev), pdev,
410		      PCI_IVAR_BUS, &bus);
411	BUS_READ_IVAR(device_get_parent(pdev), pdev,
412		      PCI_IVAR_SLOT, &slot);
413	BUS_READ_IVAR(device_get_parent(pdev), pdev,
414		      PCI_IVAR_FUNCTION, &func);
415	BUS_READ_IVAR(device_get_parent(pdev), pdev,
416		      PCI_IVAR_VENDOR, &ivend);
417	BUS_READ_IVAR(device_get_parent(pdev), pdev,
418		      PCI_IVAR_DEVICE, &idev);
419
420	off =  base
421		+ 0x00100000UL * (unsigned long)bus
422		+ 0x00001000UL * (unsigned long)(func
423						 + 8 * slot);
424
425	/* map it into the kernel */
426	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
427
428
429	if (va == NULL) {
430		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
431		return;
432	}
433	/* get a pointer to the config space mapped into the kernel */
434	cfgptr = va + (off & PAGE_MASK);
435
436	/* make sure that we can really access it */
437	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
438	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
439	if (! (vendor_id == ivend && device_id == idev)) {
440		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
441			      vendor_id, device_id);
442		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
443		return;
444	}
445
446	ptr32 = (uint32_t*)(cfgptr + 0x178);
447	val = *ptr32;
448
449	if (val == 0xffffffff) {
450		device_printf(sc->dev, "extended mapping failed\n");
451		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
452		return;
453	}
454	*ptr32 = val | 0x40;
455	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
456	if (mxge_verbose)
457		device_printf(sc->dev,
458			      "Enabled ECRC on upstream Nvidia bridge "
459			      "at %d:%d:%d\n",
460			      (int)bus, (int)slot, (int)func);
461	return;
462}
463#else
464static void
465mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
466{
467	device_printf(sc->dev,
468		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
469	return;
470}
471#endif
472
473
474static int
475mxge_dma_test(mxge_softc_t *sc, int test_type)
476{
477	mxge_cmd_t cmd;
478	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
479	int status;
480	uint32_t len;
481	char *test = " ";
482
483
484	/* Run a small DMA test.
485	 * The magic multipliers to the length tell the firmware
486	 * to do DMA read, write, or read+write tests.  The
487	 * results are returned in cmd.data0.  The upper 16
488	 * bits of the return is the number of transfers completed.
489	 * The lower 16 bits is the time in 0.5us ticks that the
490	 * transfers took to complete.
491	 */
492
493	len = sc->tx_boundary;
494
495	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
496	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
497	cmd.data2 = len * 0x10000;
498	status = mxge_send_cmd(sc, test_type, &cmd);
499	if (status != 0) {
500		test = "read";
501		goto abort;
502	}
503	sc->read_dma = ((cmd.data0>>16) * len * 2) /
504		(cmd.data0 & 0xffff);
505	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
506	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
507	cmd.data2 = len * 0x1;
508	status = mxge_send_cmd(sc, test_type, &cmd);
509	if (status != 0) {
510		test = "write";
511		goto abort;
512	}
513	sc->write_dma = ((cmd.data0>>16) * len * 2) /
514		(cmd.data0 & 0xffff);
515
516	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
517	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
518	cmd.data2 = len * 0x10001;
519	status = mxge_send_cmd(sc, test_type, &cmd);
520	if (status != 0) {
521		test = "read/write";
522		goto abort;
523	}
524	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
525		(cmd.data0 & 0xffff);
526
527abort:
528	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
529		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
530			      test, status);
531
532	return status;
533}
534
535/*
536 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
537 * when the PCI-E Completion packets are aligned on an 8-byte
538 * boundary.  Some PCI-E chip sets always align Completion packets; on
539 * the ones that do not, the alignment can be enforced by enabling
540 * ECRC generation (if supported).
541 *
542 * When PCI-E Completion packets are not aligned, it is actually more
543 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
544 *
545 * If the driver can neither enable ECRC nor verify that it has
546 * already been enabled, then it must use a firmware image which works
547 * around unaligned completion packets (ethp_z8e.dat), and it should
548 * also ensure that it never gives the device a Read-DMA which is
549 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
550 * enabled, then the driver should use the aligned (eth_z8e.dat)
551 * firmware image, and set tx_boundary to 4KB.
552 */
553
554static int
555mxge_firmware_probe(mxge_softc_t *sc)
556{
557	device_t dev = sc->dev;
558	int reg, status;
559	uint16_t pectl;
560
561	sc->tx_boundary = 4096;
562	/*
563	 * Verify the max read request size was set to 4KB
564	 * before trying the test with 4KB.
565	 */
566	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
567		pectl = pci_read_config(dev, reg + 0x8, 2);
568		if ((pectl & (5 << 12)) != (5 << 12)) {
569			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
570				      pectl);
571			sc->tx_boundary = 2048;
572		}
573	}
574
575	/*
576	 * load the optimized firmware (which assumes aligned PCIe
577	 * completions) in order to see if it works on this host.
578	 */
579	sc->fw_name = mxge_fw_aligned;
580	status = mxge_load_firmware(sc, 1);
581	if (status != 0) {
582		return status;
583	}
584
585	/*
586	 * Enable ECRC if possible
587	 */
588	mxge_enable_nvidia_ecrc(sc);
589
590	/*
591	 * Run a DMA test which watches for unaligned completions and
592	 * aborts on the first one seen.  Not required on Z8ES or newer.
593	 */
594	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
595		return 0;
596	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
597	if (status == 0)
598		return 0; /* keep the aligned firmware */
599
600	if (status != E2BIG)
601		device_printf(dev, "DMA test failed: %d\n", status);
602	if (status == ENOSYS)
603		device_printf(dev, "Falling back to ethp! "
604			      "Please install up to date fw\n");
605	return status;
606}
607
608static int
609mxge_select_firmware(mxge_softc_t *sc)
610{
611	int aligned = 0;
612	int force_firmware = mxge_force_firmware;
613
614	if (sc->throttle)
615		force_firmware = sc->throttle;
616
617	if (force_firmware != 0) {
618		if (force_firmware == 1)
619			aligned = 1;
620		else
621			aligned = 0;
622		if (mxge_verbose)
623			device_printf(sc->dev,
624				      "Assuming %s completions (forced)\n",
625				      aligned ? "aligned" : "unaligned");
626		goto abort;
627	}
628
629	/* if the PCIe link width is 4 or less, we can use the aligned
630	   firmware and skip any checks */
631	if (sc->link_width != 0 && sc->link_width <= 4) {
632		device_printf(sc->dev,
633			      "PCIe x%d Link, expect reduced performance\n",
634			      sc->link_width);
635		aligned = 1;
636		goto abort;
637	}
638
639	if (0 == mxge_firmware_probe(sc))
640		return 0;
641
642abort:
643	if (aligned) {
644		sc->fw_name = mxge_fw_aligned;
645		sc->tx_boundary = 4096;
646	} else {
647		sc->fw_name = mxge_fw_unaligned;
648		sc->tx_boundary = 2048;
649	}
650	return (mxge_load_firmware(sc, 0));
651}
652
653static int
654mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
655{
656
657
658	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
659		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
660			      be32toh(hdr->mcp_type));
661		return EIO;
662	}
663
664	/* save firmware version for sysctl */
665	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
666	if (mxge_verbose)
667		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
668
669	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
670	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
671
672	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
673	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
674		device_printf(sc->dev, "Found firmware version %s\n",
675			      sc->fw_version);
676		device_printf(sc->dev, "Driver needs %d.%d\n",
677			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
678		return EINVAL;
679	}
680	return 0;
681
682}
683
684static void *
685z_alloc(void *nil, u_int items, u_int size)
686{
687        void *ptr;
688
689        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
690        return ptr;
691}
692
693static void
694z_free(void *nil, void *ptr)
695{
696        free(ptr, M_TEMP);
697}
698
699
700static int
701mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
702{
703	z_stream zs;
704	char *inflate_buffer;
705	const struct firmware *fw;
706	const mcp_gen_header_t *hdr;
707	unsigned hdr_offset;
708	int status;
709	unsigned int i;
710	char dummy;
711	size_t fw_len;
712
713	fw = firmware_get(sc->fw_name);
714	if (fw == NULL) {
715		device_printf(sc->dev, "Could not find firmware image %s\n",
716			      sc->fw_name);
717		return ENOENT;
718	}
719
720
721
722	/* setup zlib and decompress f/w */
723	bzero(&zs, sizeof (zs));
724	zs.zalloc = z_alloc;
725	zs.zfree = z_free;
726	status = inflateInit(&zs);
727	if (status != Z_OK) {
728		status = EIO;
729		goto abort_with_fw;
730	}
731
732	/* the uncompressed size is stored as the firmware version,
733	   which would otherwise go unused */
734	fw_len = (size_t) fw->version;
735	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
736	if (inflate_buffer == NULL)
737		goto abort_with_zs;
738	zs.avail_in = fw->datasize;
739	zs.next_in = __DECONST(char *, fw->data);
740	zs.avail_out = fw_len;
741	zs.next_out = inflate_buffer;
742	status = inflate(&zs, Z_FINISH);
743	if (status != Z_STREAM_END) {
744		device_printf(sc->dev, "zlib %d\n", status);
745		status = EIO;
746		goto abort_with_buffer;
747	}
748
749	/* check id */
750	hdr_offset = htobe32(*(const uint32_t *)
751			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
752	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
753		device_printf(sc->dev, "Bad firmware file");
754		status = EIO;
755		goto abort_with_buffer;
756	}
757	hdr = (const void*)(inflate_buffer + hdr_offset);
758
759	status = mxge_validate_firmware(sc, hdr);
760	if (status != 0)
761		goto abort_with_buffer;
762
763	/* Copy the inflated firmware to NIC SRAM. */
764	for (i = 0; i < fw_len; i += 256) {
765		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
766			      inflate_buffer + i,
767			      min(256U, (unsigned)(fw_len - i)));
768		wmb();
769		dummy = *sc->sram;
770		wmb();
771	}
772
773	*limit = fw_len;
774	status = 0;
775abort_with_buffer:
776	free(inflate_buffer, M_TEMP);
777abort_with_zs:
778	inflateEnd(&zs);
779abort_with_fw:
780	firmware_put(fw, FIRMWARE_UNLOAD);
781	return status;
782}
783
784/*
785 * Enable or disable periodic RDMAs from the host to make certain
786 * chipsets resend dropped PCIe messages
787 */
788
789static void
790mxge_dummy_rdma(mxge_softc_t *sc, int enable)
791{
792	char buf_bytes[72];
793	volatile uint32_t *confirm;
794	volatile char *submit;
795	uint32_t *buf, dma_low, dma_high;
796	int i;
797
798	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
799
800	/* clear confirmation addr */
801	confirm = (volatile uint32_t *)sc->cmd;
802	*confirm = 0;
803	wmb();
804
805	/* send an rdma command to the PCIe engine, and wait for the
806	   response in the confirmation address.  The firmware should
807	   write a -1 there to indicate it is alive and well
808	*/
809
810	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
811	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
812	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
813	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
814	buf[2] = htobe32(0xffffffff);		/* confirm data */
815	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
816	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
817	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
818	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
819	buf[5] = htobe32(enable);			/* enable? */
820
821
822	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
823
824	mxge_pio_copy(submit, buf, 64);
825	wmb();
826	DELAY(1000);
827	wmb();
828	i = 0;
829	while (*confirm != 0xffffffff && i < 20) {
830		DELAY(1000);
831		i++;
832	}
833	if (*confirm != 0xffffffff) {
834		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
835			      (enable ? "enable" : "disable"), confirm,
836			      *confirm);
837	}
838	return;
839}
840
841static int
842mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
843{
844	mcp_cmd_t *buf;
845	char buf_bytes[sizeof(*buf) + 8];
846	volatile mcp_cmd_response_t *response = sc->cmd;
847	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
848	uint32_t dma_low, dma_high;
849	int err, sleep_total = 0;
850
851	/* ensure buf is aligned to 8 bytes */
852	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
853
854	buf->data0 = htobe32(data->data0);
855	buf->data1 = htobe32(data->data1);
856	buf->data2 = htobe32(data->data2);
857	buf->cmd = htobe32(cmd);
858	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
859	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
860
861	buf->response_addr.low = htobe32(dma_low);
862	buf->response_addr.high = htobe32(dma_high);
863	mtx_lock(&sc->cmd_mtx);
864	response->result = 0xffffffff;
865	wmb();
866	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
867
868	/* wait up to 20ms */
869	err = EAGAIN;
870	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
871		bus_dmamap_sync(sc->cmd_dma.dmat,
872				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
873		wmb();
874		switch (be32toh(response->result)) {
875		case 0:
876			data->data0 = be32toh(response->data);
877			err = 0;
878			break;
879		case 0xffffffff:
880			DELAY(1000);
881			break;
882		case MXGEFW_CMD_UNKNOWN:
883			err = ENOSYS;
884			break;
885		case MXGEFW_CMD_ERROR_UNALIGNED:
886			err = E2BIG;
887			break;
888		case MXGEFW_CMD_ERROR_BUSY:
889			err = EBUSY;
890			break;
891		case MXGEFW_CMD_ERROR_I2C_ABSENT:
892			err = ENXIO;
893			break;
894		default:
895			device_printf(sc->dev,
896				      "mxge: command %d "
897				      "failed, result = %d\n",
898				      cmd, be32toh(response->result));
899			err = ENXIO;
900			break;
901		}
902		if (err != EAGAIN)
903			break;
904	}
905	if (err == EAGAIN)
906		device_printf(sc->dev, "mxge: command %d timed out"
907			      "result = %d\n",
908			      cmd, be32toh(response->result));
909	mtx_unlock(&sc->cmd_mtx);
910	return err;
911}
912
913static int
914mxge_adopt_running_firmware(mxge_softc_t *sc)
915{
916	struct mcp_gen_header *hdr;
917	const size_t bytes = sizeof (struct mcp_gen_header);
918	size_t hdr_offset;
919	int status;
920
921	/* find running firmware header */
922	hdr_offset = htobe32(*(volatile uint32_t *)
923			     (sc->sram + MCP_HEADER_PTR_OFFSET));
924
925	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
926		device_printf(sc->dev,
927			      "Running firmware has bad header offset (%d)\n",
928			      (int)hdr_offset);
929		return EIO;
930	}
931
932	/* copy header of running firmware from SRAM to host memory to
933	 * validate firmware */
934	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
935	if (hdr == NULL) {
936		device_printf(sc->dev, "could not malloc firmware hdr\n");
937		return ENOMEM;
938	}
939	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
940				rman_get_bushandle(sc->mem_res),
941				hdr_offset, (char *)hdr, bytes);
942	status = mxge_validate_firmware(sc, hdr);
943	free(hdr, M_DEVBUF);
944
945	/*
946	 * check to see if adopted firmware has bug where adopting
947	 * it will cause broadcasts to be filtered unless the NIC
948	 * is kept in ALLMULTI mode
949	 */
950	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
951	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
952		sc->adopted_rx_filter_bug = 1;
953		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
954			      "working around rx filter bug\n",
955			      sc->fw_ver_major, sc->fw_ver_minor,
956			      sc->fw_ver_tiny);
957	}
958
959	return status;
960}
961
962
963static int
964mxge_load_firmware(mxge_softc_t *sc, int adopt)
965{
966	volatile uint32_t *confirm;
967	volatile char *submit;
968	char buf_bytes[72];
969	uint32_t *buf, size, dma_low, dma_high;
970	int status, i;
971
972	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
973
974	size = sc->sram_size;
975	status = mxge_load_firmware_helper(sc, &size);
976	if (status) {
977		if (!adopt)
978			return status;
979		/* Try to use the currently running firmware, if
980		   it is new enough */
981		status = mxge_adopt_running_firmware(sc);
982		if (status) {
983			device_printf(sc->dev,
984				      "failed to adopt running firmware\n");
985			return status;
986		}
987		device_printf(sc->dev,
988			      "Successfully adopted running firmware\n");
989		if (sc->tx_boundary == 4096) {
990			device_printf(sc->dev,
991				"Using firmware currently running on NIC"
992				 ".  For optimal\n");
993			device_printf(sc->dev,
994				 "performance consider loading optimized "
995				 "firmware\n");
996		}
997		sc->fw_name = mxge_fw_unaligned;
998		sc->tx_boundary = 2048;
999		return 0;
1000	}
1001	/* clear confirmation addr */
1002	confirm = (volatile uint32_t *)sc->cmd;
1003	*confirm = 0;
1004	wmb();
1005	/* send a reload command to the bootstrap MCP, and wait for the
1006	   response in the confirmation address.  The firmware should
1007	   write a -1 there to indicate it is alive and well
1008	*/
1009
1010	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1011	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1012
1013	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1014	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1015	buf[2] = htobe32(0xffffffff);	/* confirm data */
1016
1017	/* FIX: All newest firmware should un-protect the bottom of
1018	   the sram before handoff. However, the very first interfaces
1019	   do not. Therefore the handoff copy must skip the first 8 bytes
1020	*/
1021					/* where the code starts*/
1022	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1023	buf[4] = htobe32(size - 8); 	/* length of code */
1024	buf[5] = htobe32(8);		/* where to copy to */
1025	buf[6] = htobe32(0);		/* where to jump to */
1026
1027	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1028	mxge_pio_copy(submit, buf, 64);
1029	wmb();
1030	DELAY(1000);
1031	wmb();
1032	i = 0;
1033	while (*confirm != 0xffffffff && i < 20) {
1034		DELAY(1000*10);
1035		i++;
1036		bus_dmamap_sync(sc->cmd_dma.dmat,
1037				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1038	}
1039	if (*confirm != 0xffffffff) {
1040		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1041			confirm, *confirm);
1042
1043		return ENXIO;
1044	}
1045	return 0;
1046}
1047
1048static int
1049mxge_update_mac_address(mxge_softc_t *sc)
1050{
1051	mxge_cmd_t cmd;
1052	uint8_t *addr = sc->mac_addr;
1053	int status;
1054
1055
1056	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1057		     | (addr[2] << 8) | addr[3]);
1058
1059	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1060
1061	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1062	return status;
1063}
1064
1065static int
1066mxge_change_pause(mxge_softc_t *sc, int pause)
1067{
1068	mxge_cmd_t cmd;
1069	int status;
1070
1071	if (pause)
1072		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1073				       &cmd);
1074	else
1075		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1076				       &cmd);
1077
1078	if (status) {
1079		device_printf(sc->dev, "Failed to set flow control mode\n");
1080		return ENXIO;
1081	}
1082	sc->pause = pause;
1083	return 0;
1084}
1085
1086static void
1087mxge_change_promisc(mxge_softc_t *sc, int promisc)
1088{
1089	mxge_cmd_t cmd;
1090	int status;
1091
1092	if (mxge_always_promisc)
1093		promisc = 1;
1094
1095	if (promisc)
1096		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1097				       &cmd);
1098	else
1099		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1100				       &cmd);
1101
1102	if (status) {
1103		device_printf(sc->dev, "Failed to set promisc mode\n");
1104	}
1105}
1106
1107static void
1108mxge_set_multicast_list(mxge_softc_t *sc)
1109{
1110	mxge_cmd_t cmd;
1111	struct ifmultiaddr *ifma;
1112	struct ifnet *ifp = sc->ifp;
1113	int err;
1114
1115	/* This firmware is known to not support multicast */
1116	if (!sc->fw_multicast_support)
1117		return;
1118
1119	/* Disable multicast filtering while we play with the lists*/
1120	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1121	if (err != 0) {
1122		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1123		       " error status: %d\n", err);
1124		return;
1125	}
1126
1127	if (sc->adopted_rx_filter_bug)
1128		return;
1129
1130	if (ifp->if_flags & IFF_ALLMULTI)
1131		/* request to disable multicast filtering, so quit here */
1132		return;
1133
1134	/* Flush all the filters */
1135
1136	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1137	if (err != 0) {
1138		device_printf(sc->dev,
1139			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1140			      ", error status: %d\n", err);
1141		return;
1142	}
1143
1144	/* Walk the multicast list, and add each address */
1145
1146	if_maddr_rlock(ifp);
1147	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1148		if (ifma->ifma_addr->sa_family != AF_LINK)
1149			continue;
1150		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1151		      &cmd.data0, 4);
1152		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1153		      &cmd.data1, 2);
1154		cmd.data0 = htonl(cmd.data0);
1155		cmd.data1 = htonl(cmd.data1);
1156		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1157		if (err != 0) {
1158			device_printf(sc->dev, "Failed "
1159			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1160			       "%d\t", err);
1161			/* abort, leaving multicast filtering off */
1162			if_maddr_runlock(ifp);
1163			return;
1164		}
1165	}
1166	if_maddr_runlock(ifp);
1167	/* Enable multicast filtering */
1168	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1169	if (err != 0) {
1170		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1171		       ", error status: %d\n", err);
1172	}
1173}
1174
1175static int
1176mxge_max_mtu(mxge_softc_t *sc)
1177{
1178	mxge_cmd_t cmd;
1179	int status;
1180
1181	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1182		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1183
1184	/* try to set nbufs to see if it we can
1185	   use virtually contiguous jumbos */
1186	cmd.data0 = 0;
1187	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1188			       &cmd);
1189	if (status == 0)
1190		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1191
1192	/* otherwise, we're limited to MJUMPAGESIZE */
1193	return MJUMPAGESIZE - MXGEFW_PAD;
1194}
1195
1196static int
1197mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1198{
1199	struct mxge_slice_state *ss;
1200	mxge_rx_done_t *rx_done;
1201	volatile uint32_t *irq_claim;
1202	mxge_cmd_t cmd;
1203	int slice, status;
1204
1205	/* try to send a reset command to the card to see if it
1206	   is alive */
1207	memset(&cmd, 0, sizeof (cmd));
1208	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1209	if (status != 0) {
1210		device_printf(sc->dev, "failed reset\n");
1211		return ENXIO;
1212	}
1213
1214	mxge_dummy_rdma(sc, 1);
1215
1216
1217	/* set the intrq size */
1218	cmd.data0 = sc->rx_ring_size;
1219	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1220
1221	/*
1222	 * Even though we already know how many slices are supported
1223	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1224	 * has magic side effects, and must be called after a reset.
1225	 * It must be called prior to calling any RSS related cmds,
1226	 * including assigning an interrupt queue for anything but
1227	 * slice 0.  It must also be called *after*
1228	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1229	 * the firmware to compute offsets.
1230	 */
1231
1232	if (sc->num_slices > 1) {
1233		/* ask the maximum number of slices it supports */
1234		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1235					   &cmd);
1236		if (status != 0) {
1237			device_printf(sc->dev,
1238				      "failed to get number of slices\n");
1239			return status;
1240		}
1241		/*
1242		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1243		 * to setting up the interrupt queue DMA
1244		 */
1245		cmd.data0 = sc->num_slices;
1246		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1247#ifdef IFNET_BUF_RING
1248		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1249#endif
1250		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1251					   &cmd);
1252		if (status != 0) {
1253			device_printf(sc->dev,
1254				      "failed to set number of slices\n");
1255			return status;
1256		}
1257	}
1258
1259
1260	if (interrupts_setup) {
1261		/* Now exchange information about interrupts  */
1262		for (slice = 0; slice < sc->num_slices; slice++) {
1263			rx_done = &sc->ss[slice].rx_done;
1264			memset(rx_done->entry, 0, sc->rx_ring_size);
1265			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1266			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1267			cmd.data2 = slice;
1268			status |= mxge_send_cmd(sc,
1269						MXGEFW_CMD_SET_INTRQ_DMA,
1270						&cmd);
1271		}
1272	}
1273
1274	status |= mxge_send_cmd(sc,
1275				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1276
1277
1278	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1279
1280	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1281	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1282
1283
1284	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1285				&cmd);
1286	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1287	if (status != 0) {
1288		device_printf(sc->dev, "failed set interrupt parameters\n");
1289		return status;
1290	}
1291
1292
1293	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1294
1295
1296	/* run a DMA benchmark */
1297	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1298
1299	for (slice = 0; slice < sc->num_slices; slice++) {
1300		ss = &sc->ss[slice];
1301
1302		ss->irq_claim = irq_claim + (2 * slice);
1303		/* reset mcp/driver shared state back to 0 */
1304		ss->rx_done.idx = 0;
1305		ss->rx_done.cnt = 0;
1306		ss->tx.req = 0;
1307		ss->tx.done = 0;
1308		ss->tx.pkt_done = 0;
1309		ss->tx.queue_active = 0;
1310		ss->tx.activate = 0;
1311		ss->tx.deactivate = 0;
1312		ss->tx.wake = 0;
1313		ss->tx.defrag = 0;
1314		ss->tx.stall = 0;
1315		ss->rx_big.cnt = 0;
1316		ss->rx_small.cnt = 0;
1317		ss->lc.lro_bad_csum = 0;
1318		ss->lc.lro_queued = 0;
1319		ss->lc.lro_flushed = 0;
1320		if (ss->fw_stats != NULL) {
1321			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1322		}
1323	}
1324	sc->rdma_tags_available = 15;
1325	status = mxge_update_mac_address(sc);
1326	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1327	mxge_change_pause(sc, sc->pause);
1328	mxge_set_multicast_list(sc);
1329	if (sc->throttle) {
1330		cmd.data0 = sc->throttle;
1331		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1332				  &cmd)) {
1333			device_printf(sc->dev,
1334				      "can't enable throttle\n");
1335		}
1336	}
1337	return status;
1338}
1339
1340static int
1341mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1342{
1343	mxge_cmd_t cmd;
1344	mxge_softc_t *sc;
1345	int err;
1346	unsigned int throttle;
1347
1348	sc = arg1;
1349	throttle = sc->throttle;
1350	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1351        if (err != 0) {
1352                return err;
1353        }
1354
1355	if (throttle == sc->throttle)
1356		return 0;
1357
1358        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1359                return EINVAL;
1360
1361	mtx_lock(&sc->driver_mtx);
1362	cmd.data0 = throttle;
1363	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1364	if (err == 0)
1365		sc->throttle = throttle;
1366	mtx_unlock(&sc->driver_mtx);
1367	return err;
1368}
1369
1370static int
1371mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1372{
1373        mxge_softc_t *sc;
1374        unsigned int intr_coal_delay;
1375        int err;
1376
1377        sc = arg1;
1378        intr_coal_delay = sc->intr_coal_delay;
1379        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1380        if (err != 0) {
1381                return err;
1382        }
1383        if (intr_coal_delay == sc->intr_coal_delay)
1384                return 0;
1385
1386        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1387                return EINVAL;
1388
1389	mtx_lock(&sc->driver_mtx);
1390	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1391	sc->intr_coal_delay = intr_coal_delay;
1392
1393	mtx_unlock(&sc->driver_mtx);
1394        return err;
1395}
1396
1397static int
1398mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1399{
1400        mxge_softc_t *sc;
1401        unsigned int enabled;
1402        int err;
1403
1404        sc = arg1;
1405        enabled = sc->pause;
1406        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1407        if (err != 0) {
1408                return err;
1409        }
1410        if (enabled == sc->pause)
1411                return 0;
1412
1413	mtx_lock(&sc->driver_mtx);
1414	err = mxge_change_pause(sc, enabled);
1415	mtx_unlock(&sc->driver_mtx);
1416        return err;
1417}
1418
1419static int
1420mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1421{
1422        int err;
1423
1424        if (arg1 == NULL)
1425                return EFAULT;
1426        arg2 = be32toh(*(int *)arg1);
1427        arg1 = NULL;
1428        err = sysctl_handle_int(oidp, arg1, arg2, req);
1429
1430        return err;
1431}
1432
1433static void
1434mxge_rem_sysctls(mxge_softc_t *sc)
1435{
1436	struct mxge_slice_state *ss;
1437	int slice;
1438
1439	if (sc->slice_sysctl_tree == NULL)
1440		return;
1441
1442	for (slice = 0; slice < sc->num_slices; slice++) {
1443		ss = &sc->ss[slice];
1444		if (ss == NULL || ss->sysctl_tree == NULL)
1445			continue;
1446		sysctl_ctx_free(&ss->sysctl_ctx);
1447		ss->sysctl_tree = NULL;
1448	}
1449	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1450	sc->slice_sysctl_tree = NULL;
1451}
1452
1453static void
1454mxge_add_sysctls(mxge_softc_t *sc)
1455{
1456	struct sysctl_ctx_list *ctx;
1457	struct sysctl_oid_list *children;
1458	mcp_irq_data_t *fw;
1459	struct mxge_slice_state *ss;
1460	int slice;
1461	char slice_num[8];
1462
1463	ctx = device_get_sysctl_ctx(sc->dev);
1464	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1465	fw = sc->ss[0].fw_stats;
1466
1467	/* random information */
1468	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1469		       "firmware_version",
1470		       CTLFLAG_RD, sc->fw_version,
1471		       0, "firmware version");
1472	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1473		       "serial_number",
1474		       CTLFLAG_RD, sc->serial_number_string,
1475		       0, "serial number");
1476	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1477		       "product_code",
1478		       CTLFLAG_RD, sc->product_code_string,
1479		       0, "product_code");
1480	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1481		       "pcie_link_width",
1482		       CTLFLAG_RD, &sc->link_width,
1483		       0, "tx_boundary");
1484	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1485		       "tx_boundary",
1486		       CTLFLAG_RD, &sc->tx_boundary,
1487		       0, "tx_boundary");
1488	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1489		       "write_combine",
1490		       CTLFLAG_RD, &sc->wc,
1491		       0, "write combining PIO?");
1492	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1493		       "read_dma_MBs",
1494		       CTLFLAG_RD, &sc->read_dma,
1495		       0, "DMA Read speed in MB/s");
1496	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1497		       "write_dma_MBs",
1498		       CTLFLAG_RD, &sc->write_dma,
1499		       0, "DMA Write speed in MB/s");
1500	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1501		       "read_write_dma_MBs",
1502		       CTLFLAG_RD, &sc->read_write_dma,
1503		       0, "DMA concurrent Read/Write speed in MB/s");
1504	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1505		       "watchdog_resets",
1506		       CTLFLAG_RD, &sc->watchdog_resets,
1507		       0, "Number of times NIC was reset");
1508
1509
1510	/* performance related tunables */
1511	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1512			"intr_coal_delay",
1513			CTLTYPE_INT|CTLFLAG_RW, sc,
1514			0, mxge_change_intr_coal,
1515			"I", "interrupt coalescing delay in usecs");
1516
1517	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1518			"throttle",
1519			CTLTYPE_INT|CTLFLAG_RW, sc,
1520			0, mxge_change_throttle,
1521			"I", "transmit throttling");
1522
1523	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1524			"flow_control_enabled",
1525			CTLTYPE_INT|CTLFLAG_RW, sc,
1526			0, mxge_change_flow_control,
1527			"I", "interrupt coalescing delay in usecs");
1528
1529	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1530		       "deassert_wait",
1531		       CTLFLAG_RW, &mxge_deassert_wait,
1532		       0, "Wait for IRQ line to go low in ihandler");
1533
1534	/* stats block from firmware is in network byte order.
1535	   Need to swap it */
1536	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1537			"link_up",
1538			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1539			0, mxge_handle_be32,
1540			"I", "link up");
1541	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1542			"rdma_tags_available",
1543			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1544			0, mxge_handle_be32,
1545			"I", "rdma_tags_available");
1546	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1547			"dropped_bad_crc32",
1548			CTLTYPE_INT|CTLFLAG_RD,
1549			&fw->dropped_bad_crc32,
1550			0, mxge_handle_be32,
1551			"I", "dropped_bad_crc32");
1552	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1553			"dropped_bad_phy",
1554			CTLTYPE_INT|CTLFLAG_RD,
1555			&fw->dropped_bad_phy,
1556			0, mxge_handle_be32,
1557			"I", "dropped_bad_phy");
1558	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1559			"dropped_link_error_or_filtered",
1560			CTLTYPE_INT|CTLFLAG_RD,
1561			&fw->dropped_link_error_or_filtered,
1562			0, mxge_handle_be32,
1563			"I", "dropped_link_error_or_filtered");
1564	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1565			"dropped_link_overflow",
1566			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1567			0, mxge_handle_be32,
1568			"I", "dropped_link_overflow");
1569	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1570			"dropped_multicast_filtered",
1571			CTLTYPE_INT|CTLFLAG_RD,
1572			&fw->dropped_multicast_filtered,
1573			0, mxge_handle_be32,
1574			"I", "dropped_multicast_filtered");
1575	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576			"dropped_no_big_buffer",
1577			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1578			0, mxge_handle_be32,
1579			"I", "dropped_no_big_buffer");
1580	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581			"dropped_no_small_buffer",
1582			CTLTYPE_INT|CTLFLAG_RD,
1583			&fw->dropped_no_small_buffer,
1584			0, mxge_handle_be32,
1585			"I", "dropped_no_small_buffer");
1586	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1587			"dropped_overrun",
1588			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1589			0, mxge_handle_be32,
1590			"I", "dropped_overrun");
1591	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592			"dropped_pause",
1593			CTLTYPE_INT|CTLFLAG_RD,
1594			&fw->dropped_pause,
1595			0, mxge_handle_be32,
1596			"I", "dropped_pause");
1597	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598			"dropped_runt",
1599			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1600			0, mxge_handle_be32,
1601			"I", "dropped_runt");
1602
1603	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1604			"dropped_unicast_filtered",
1605			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1606			0, mxge_handle_be32,
1607			"I", "dropped_unicast_filtered");
1608
1609	/* verbose printing? */
1610	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1611		       "verbose",
1612		       CTLFLAG_RW, &mxge_verbose,
1613		       0, "verbose printing");
1614
1615	/* add counters exported for debugging from all slices */
1616	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1617	sc->slice_sysctl_tree =
1618		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1619				"slice", CTLFLAG_RD, 0, "");
1620
1621	for (slice = 0; slice < sc->num_slices; slice++) {
1622		ss = &sc->ss[slice];
1623		sysctl_ctx_init(&ss->sysctl_ctx);
1624		ctx = &ss->sysctl_ctx;
1625		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1626		sprintf(slice_num, "%d", slice);
1627		ss->sysctl_tree =
1628			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1629					CTLFLAG_RD, 0, "");
1630		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1631		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632			       "rx_small_cnt",
1633			       CTLFLAG_RD, &ss->rx_small.cnt,
1634			       0, "rx_small_cnt");
1635		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636			       "rx_big_cnt",
1637			       CTLFLAG_RD, &ss->rx_big.cnt,
1638			       0, "rx_small_cnt");
1639		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1640			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1641			       0, "number of lro merge queues flushed");
1642
1643		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1644			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1645			       0, "number of bad csums preventing LRO");
1646
1647		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1648			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1649			       0, "number of frames appended to lro merge"
1650			       "queues");
1651
1652#ifndef IFNET_BUF_RING
1653		/* only transmit from slice 0 for now */
1654		if (slice > 0)
1655			continue;
1656#endif
1657		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1658			       "tx_req",
1659			       CTLFLAG_RD, &ss->tx.req,
1660			       0, "tx_req");
1661
1662		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663			       "tx_done",
1664			       CTLFLAG_RD, &ss->tx.done,
1665			       0, "tx_done");
1666		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667			       "tx_pkt_done",
1668			       CTLFLAG_RD, &ss->tx.pkt_done,
1669			       0, "tx_done");
1670		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671			       "tx_stall",
1672			       CTLFLAG_RD, &ss->tx.stall,
1673			       0, "tx_stall");
1674		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675			       "tx_wake",
1676			       CTLFLAG_RD, &ss->tx.wake,
1677			       0, "tx_wake");
1678		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679			       "tx_defrag",
1680			       CTLFLAG_RD, &ss->tx.defrag,
1681			       0, "tx_defrag");
1682		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683			       "tx_queue_active",
1684			       CTLFLAG_RD, &ss->tx.queue_active,
1685			       0, "tx_queue_active");
1686		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687			       "tx_activate",
1688			       CTLFLAG_RD, &ss->tx.activate,
1689			       0, "tx_activate");
1690		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691			       "tx_deactivate",
1692			       CTLFLAG_RD, &ss->tx.deactivate,
1693			       0, "tx_deactivate");
1694	}
1695}
1696
1697/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1698   backwards one at a time and handle ring wraps */
1699
1700static inline void
1701mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1702			    mcp_kreq_ether_send_t *src, int cnt)
1703{
1704        int idx, starting_slot;
1705        starting_slot = tx->req;
1706        while (cnt > 1) {
1707                cnt--;
1708                idx = (starting_slot + cnt) & tx->mask;
1709                mxge_pio_copy(&tx->lanai[idx],
1710			      &src[cnt], sizeof(*src));
1711                wmb();
1712        }
1713}
1714
1715/*
1716 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1717 * at most 32 bytes at a time, so as to avoid involving the software
1718 * pio handler in the nic.   We re-write the first segment's flags
1719 * to mark them valid only after writing the entire chain
1720 */
1721
1722static inline void
1723mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1724                  int cnt)
1725{
1726        int idx, i;
1727        uint32_t *src_ints;
1728	volatile uint32_t *dst_ints;
1729        mcp_kreq_ether_send_t *srcp;
1730	volatile mcp_kreq_ether_send_t *dstp, *dst;
1731	uint8_t last_flags;
1732
1733        idx = tx->req & tx->mask;
1734
1735	last_flags = src->flags;
1736	src->flags = 0;
1737        wmb();
1738        dst = dstp = &tx->lanai[idx];
1739        srcp = src;
1740
1741        if ((idx + cnt) < tx->mask) {
1742                for (i = 0; i < (cnt - 1); i += 2) {
1743                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1744                        wmb(); /* force write every 32 bytes */
1745                        srcp += 2;
1746                        dstp += 2;
1747                }
1748        } else {
1749                /* submit all but the first request, and ensure
1750                   that it is submitted below */
1751                mxge_submit_req_backwards(tx, src, cnt);
1752                i = 0;
1753        }
1754        if (i < cnt) {
1755                /* submit the first request */
1756                mxge_pio_copy(dstp, srcp, sizeof(*src));
1757                wmb(); /* barrier before setting valid flag */
1758        }
1759
1760        /* re-write the last 32-bits with the valid flags */
1761        src->flags = last_flags;
1762        src_ints = (uint32_t *)src;
1763        src_ints+=3;
1764        dst_ints = (volatile uint32_t *)dst;
1765        dst_ints+=3;
1766        *dst_ints =  *src_ints;
1767        tx->req += cnt;
1768        wmb();
1769}
1770
1771static int
1772mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1773    struct mxge_pkt_info *pi)
1774{
1775	struct ether_vlan_header *eh;
1776	uint16_t etype;
1777	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1778#if IFCAP_TSO6 && defined(INET6)
1779	int nxt;
1780#endif
1781
1782	eh = mtod(m, struct ether_vlan_header *);
1783	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1784		etype = ntohs(eh->evl_proto);
1785		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1786	} else {
1787		etype = ntohs(eh->evl_encap_proto);
1788		pi->ip_off = ETHER_HDR_LEN;
1789	}
1790
1791	switch (etype) {
1792	case ETHERTYPE_IP:
1793		/*
1794		 * ensure ip header is in first mbuf, copy it to a
1795		 * scratch buffer if not
1796		 */
1797		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1798		pi->ip6 = NULL;
1799		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1800			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1801			    ss->scratch);
1802			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1803		}
1804		pi->ip_hlen = pi->ip->ip_hl << 2;
1805		if (!tso)
1806			return 0;
1807
1808		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1809		    sizeof(struct tcphdr))) {
1810			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1811			    sizeof(struct tcphdr), ss->scratch);
1812			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1813		}
1814		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1815		break;
1816#if IFCAP_TSO6 && defined(INET6)
1817	case ETHERTYPE_IPV6:
1818		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1819		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1820			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1821			    ss->scratch);
1822			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1823		}
1824		nxt = 0;
1825		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1826		pi->ip_hlen -= pi->ip_off;
1827		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1828			return EINVAL;
1829
1830		if (!tso)
1831			return 0;
1832
1833		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1834			return EINVAL;
1835
1836		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1837		    sizeof(struct tcphdr))) {
1838			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1839			    sizeof(struct tcphdr), ss->scratch);
1840			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1841		}
1842		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1843		break;
1844#endif
1845	default:
1846		return EINVAL;
1847	}
1848	return 0;
1849}
1850
1851#if IFCAP_TSO4
1852
1853static void
1854mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1855	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1856{
1857	mxge_tx_ring_t *tx;
1858	mcp_kreq_ether_send_t *req;
1859	bus_dma_segment_t *seg;
1860	uint32_t low, high_swapped;
1861	int len, seglen, cum_len, cum_len_next;
1862	int next_is_first, chop, cnt, rdma_count, small;
1863	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1864	uint8_t flags, flags_next;
1865	static int once;
1866
1867	mss = m->m_pkthdr.tso_segsz;
1868
1869	/* negative cum_len signifies to the
1870	 * send loop that we are still in the
1871	 * header portion of the TSO packet.
1872	 */
1873
1874	cksum_offset = pi->ip_off + pi->ip_hlen;
1875	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1876
1877	/* TSO implies checksum offload on this hardware */
1878	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1879		/*
1880		 * If packet has full TCP csum, replace it with pseudo hdr
1881		 * sum that the NIC expects, otherwise the NIC will emit
1882		 * packets with bad TCP checksums.
1883		 */
1884		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1885		if (pi->ip6) {
1886#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1887			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1888			sum = in6_cksum_pseudo(pi->ip6,
1889			    m->m_pkthdr.len - cksum_offset,
1890			    IPPROTO_TCP, 0);
1891#endif
1892		} else {
1893#ifdef INET
1894			m->m_pkthdr.csum_flags |= CSUM_TCP;
1895			sum = in_pseudo(pi->ip->ip_src.s_addr,
1896			    pi->ip->ip_dst.s_addr,
1897			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1898				    cksum_offset)));
1899#endif
1900		}
1901		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1902		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1903	}
1904	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1905
1906
1907	/* for TSO, pseudo_hdr_offset holds mss.
1908	 * The firmware figures out where to put
1909	 * the checksum by parsing the header. */
1910	pseudo_hdr_offset = htobe16(mss);
1911
1912	if (pi->ip6) {
1913		/*
1914		 * for IPv6 TSO, the "checksum offset" is re-purposed
1915		 * to store the TCP header len
1916		 */
1917		cksum_offset = (pi->tcp->th_off << 2);
1918	}
1919
1920	tx = &ss->tx;
1921	req = tx->req_list;
1922	seg = tx->seg_list;
1923	cnt = 0;
1924	rdma_count = 0;
1925	/* "rdma_count" is the number of RDMAs belonging to the
1926	 * current packet BEFORE the current send request. For
1927	 * non-TSO packets, this is equal to "count".
1928	 * For TSO packets, rdma_count needs to be reset
1929	 * to 0 after a segment cut.
1930	 *
1931	 * The rdma_count field of the send request is
1932	 * the number of RDMAs of the packet starting at
1933	 * that request. For TSO send requests with one ore more cuts
1934	 * in the middle, this is the number of RDMAs starting
1935	 * after the last cut in the request. All previous
1936	 * segments before the last cut implicitly have 1 RDMA.
1937	 *
1938	 * Since the number of RDMAs is not known beforehand,
1939	 * it must be filled-in retroactively - after each
1940	 * segmentation cut or at the end of the entire packet.
1941	 */
1942
1943	while (busdma_seg_cnt) {
1944		/* Break the busdma segment up into pieces*/
1945		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1946		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1947		len = seg->ds_len;
1948
1949		while (len) {
1950			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1951			seglen = len;
1952			cum_len_next = cum_len + seglen;
1953			(req-rdma_count)->rdma_count = rdma_count + 1;
1954			if (__predict_true(cum_len >= 0)) {
1955				/* payload */
1956				chop = (cum_len_next > mss);
1957				cum_len_next = cum_len_next % mss;
1958				next_is_first = (cum_len_next == 0);
1959				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1960				flags_next |= next_is_first *
1961					MXGEFW_FLAGS_FIRST;
1962				rdma_count |= -(chop | next_is_first);
1963				rdma_count += chop & !next_is_first;
1964			} else if (cum_len_next >= 0) {
1965				/* header ends */
1966				rdma_count = -1;
1967				cum_len_next = 0;
1968				seglen = -cum_len;
1969				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1970				flags_next = MXGEFW_FLAGS_TSO_PLD |
1971					MXGEFW_FLAGS_FIRST |
1972					(small * MXGEFW_FLAGS_SMALL);
1973			    }
1974
1975			req->addr_high = high_swapped;
1976			req->addr_low = htobe32(low);
1977			req->pseudo_hdr_offset = pseudo_hdr_offset;
1978			req->pad = 0;
1979			req->rdma_count = 1;
1980			req->length = htobe16(seglen);
1981			req->cksum_offset = cksum_offset;
1982			req->flags = flags | ((cum_len & 1) *
1983					      MXGEFW_FLAGS_ALIGN_ODD);
1984			low += seglen;
1985			len -= seglen;
1986			cum_len = cum_len_next;
1987			flags = flags_next;
1988			req++;
1989			cnt++;
1990			rdma_count++;
1991			if (cksum_offset != 0 && !pi->ip6) {
1992				if (__predict_false(cksum_offset > seglen))
1993					cksum_offset -= seglen;
1994				else
1995					cksum_offset = 0;
1996			}
1997			if (__predict_false(cnt > tx->max_desc))
1998				goto drop;
1999		}
2000		busdma_seg_cnt--;
2001		seg++;
2002	}
2003	(req-rdma_count)->rdma_count = rdma_count;
2004
2005	do {
2006		req--;
2007		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2008	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2009
2010	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2011	mxge_submit_req(tx, tx->req_list, cnt);
2012#ifdef IFNET_BUF_RING
2013	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2014		/* tell the NIC to start polling this slice */
2015		*tx->send_go = 1;
2016		tx->queue_active = 1;
2017		tx->activate++;
2018		wmb();
2019	}
2020#endif
2021	return;
2022
2023drop:
2024	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2025	m_freem(m);
2026	ss->oerrors++;
2027	if (!once) {
2028		printf("tx->max_desc exceeded via TSO!\n");
2029		printf("mss = %d, %ld, %d!\n", mss,
2030		       (long)seg - (long)tx->seg_list, tx->max_desc);
2031		once = 1;
2032	}
2033	return;
2034
2035}
2036
2037#endif /* IFCAP_TSO4 */
2038
2039#ifdef MXGE_NEW_VLAN_API
2040/*
2041 * We reproduce the software vlan tag insertion from
2042 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2043 * vlan tag insertion. We need to advertise this in order to have the
2044 * vlan interface respect our csum offload flags.
2045 */
2046static struct mbuf *
2047mxge_vlan_tag_insert(struct mbuf *m)
2048{
2049	struct ether_vlan_header *evl;
2050
2051	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2052	if (__predict_false(m == NULL))
2053		return NULL;
2054	if (m->m_len < sizeof(*evl)) {
2055		m = m_pullup(m, sizeof(*evl));
2056		if (__predict_false(m == NULL))
2057			return NULL;
2058	}
2059	/*
2060	 * Transform the Ethernet header into an Ethernet header
2061	 * with 802.1Q encapsulation.
2062	 */
2063	evl = mtod(m, struct ether_vlan_header *);
2064	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2065	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2066	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2067	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2068	m->m_flags &= ~M_VLANTAG;
2069	return m;
2070}
2071#endif /* MXGE_NEW_VLAN_API */
2072
2073static void
2074mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2075{
2076	struct mxge_pkt_info pi = {0,0,0,0};
2077	mxge_softc_t *sc;
2078	mcp_kreq_ether_send_t *req;
2079	bus_dma_segment_t *seg;
2080	struct mbuf *m_tmp;
2081	struct ifnet *ifp;
2082	mxge_tx_ring_t *tx;
2083	int cnt, cum_len, err, i, idx, odd_flag;
2084	uint16_t pseudo_hdr_offset;
2085        uint8_t flags, cksum_offset;
2086
2087
2088	sc = ss->sc;
2089	ifp = sc->ifp;
2090	tx = &ss->tx;
2091
2092#ifdef MXGE_NEW_VLAN_API
2093	if (m->m_flags & M_VLANTAG) {
2094		m = mxge_vlan_tag_insert(m);
2095		if (__predict_false(m == NULL))
2096			goto drop_without_m;
2097	}
2098#endif
2099	if (m->m_pkthdr.csum_flags &
2100	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2101		if (mxge_parse_tx(ss, m, &pi))
2102			goto drop;
2103	}
2104
2105	/* (try to) map the frame for DMA */
2106	idx = tx->req & tx->mask;
2107	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2108				      m, tx->seg_list, &cnt,
2109				      BUS_DMA_NOWAIT);
2110	if (__predict_false(err == EFBIG)) {
2111		/* Too many segments in the chain.  Try
2112		   to defrag */
2113		m_tmp = m_defrag(m, M_NOWAIT);
2114		if (m_tmp == NULL) {
2115			goto drop;
2116		}
2117		ss->tx.defrag++;
2118		m = m_tmp;
2119		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2120					      tx->info[idx].map,
2121					      m, tx->seg_list, &cnt,
2122					      BUS_DMA_NOWAIT);
2123	}
2124	if (__predict_false(err != 0)) {
2125		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2126			      " packet len = %d\n", err, m->m_pkthdr.len);
2127		goto drop;
2128	}
2129	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2130			BUS_DMASYNC_PREWRITE);
2131	tx->info[idx].m = m;
2132
2133#if IFCAP_TSO4
2134	/* TSO is different enough, we handle it in another routine */
2135	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2136		mxge_encap_tso(ss, m, cnt, &pi);
2137		return;
2138	}
2139#endif
2140
2141	req = tx->req_list;
2142	cksum_offset = 0;
2143	pseudo_hdr_offset = 0;
2144	flags = MXGEFW_FLAGS_NO_TSO;
2145
2146	/* checksum offloading? */
2147	if (m->m_pkthdr.csum_flags &
2148	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2149		/* ensure ip header is in first mbuf, copy
2150		   it to a scratch buffer if not */
2151		cksum_offset = pi.ip_off + pi.ip_hlen;
2152		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2153		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2154		req->cksum_offset = cksum_offset;
2155		flags |= MXGEFW_FLAGS_CKSUM;
2156		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2157	} else {
2158		odd_flag = 0;
2159	}
2160	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2161		flags |= MXGEFW_FLAGS_SMALL;
2162
2163	/* convert segments into a request list */
2164	cum_len = 0;
2165	seg = tx->seg_list;
2166	req->flags = MXGEFW_FLAGS_FIRST;
2167	for (i = 0; i < cnt; i++) {
2168		req->addr_low =
2169			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2170		req->addr_high =
2171			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2172		req->length = htobe16(seg->ds_len);
2173		req->cksum_offset = cksum_offset;
2174		if (cksum_offset > seg->ds_len)
2175			cksum_offset -= seg->ds_len;
2176		else
2177			cksum_offset = 0;
2178		req->pseudo_hdr_offset = pseudo_hdr_offset;
2179		req->pad = 0; /* complete solid 16-byte block */
2180		req->rdma_count = 1;
2181		req->flags |= flags | ((cum_len & 1) * odd_flag);
2182		cum_len += seg->ds_len;
2183		seg++;
2184		req++;
2185		req->flags = 0;
2186	}
2187	req--;
2188	/* pad runts to 60 bytes */
2189	if (cum_len < 60) {
2190		req++;
2191		req->addr_low =
2192			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2193		req->addr_high =
2194			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2195		req->length = htobe16(60 - cum_len);
2196		req->cksum_offset = 0;
2197		req->pseudo_hdr_offset = pseudo_hdr_offset;
2198		req->pad = 0; /* complete solid 16-byte block */
2199		req->rdma_count = 1;
2200		req->flags |= flags | ((cum_len & 1) * odd_flag);
2201		cnt++;
2202	}
2203
2204	tx->req_list[0].rdma_count = cnt;
2205#if 0
2206	/* print what the firmware will see */
2207	for (i = 0; i < cnt; i++) {
2208		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2209		    "cso:%d, flags:0x%x, rdma:%d\n",
2210		    i, (int)ntohl(tx->req_list[i].addr_high),
2211		    (int)ntohl(tx->req_list[i].addr_low),
2212		    (int)ntohs(tx->req_list[i].length),
2213		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2214		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2215		    tx->req_list[i].rdma_count);
2216	}
2217	printf("--------------\n");
2218#endif
2219	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2220	mxge_submit_req(tx, tx->req_list, cnt);
2221#ifdef IFNET_BUF_RING
2222	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2223		/* tell the NIC to start polling this slice */
2224		*tx->send_go = 1;
2225		tx->queue_active = 1;
2226		tx->activate++;
2227		wmb();
2228	}
2229#endif
2230	return;
2231
2232drop:
2233	m_freem(m);
2234drop_without_m:
2235	ss->oerrors++;
2236	return;
2237}
2238
2239#ifdef IFNET_BUF_RING
2240static void
2241mxge_qflush(struct ifnet *ifp)
2242{
2243	mxge_softc_t *sc = ifp->if_softc;
2244	mxge_tx_ring_t *tx;
2245	struct mbuf *m;
2246	int slice;
2247
2248	for (slice = 0; slice < sc->num_slices; slice++) {
2249		tx = &sc->ss[slice].tx;
2250		mtx_lock(&tx->mtx);
2251		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2252			m_freem(m);
2253		mtx_unlock(&tx->mtx);
2254	}
2255	if_qflush(ifp);
2256}
2257
2258static inline void
2259mxge_start_locked(struct mxge_slice_state *ss)
2260{
2261	mxge_softc_t *sc;
2262	struct mbuf *m;
2263	struct ifnet *ifp;
2264	mxge_tx_ring_t *tx;
2265
2266	sc = ss->sc;
2267	ifp = sc->ifp;
2268	tx = &ss->tx;
2269
2270	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2271		m = drbr_dequeue(ifp, tx->br);
2272		if (m == NULL) {
2273			return;
2274		}
2275		/* let BPF see it */
2276		BPF_MTAP(ifp, m);
2277
2278		/* give it to the nic */
2279		mxge_encap(ss, m);
2280	}
2281	/* ran out of transmit slots */
2282	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2283	    && (!drbr_empty(ifp, tx->br))) {
2284		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2285		tx->stall++;
2286	}
2287}
2288
2289static int
2290mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2291{
2292	mxge_softc_t *sc;
2293	struct ifnet *ifp;
2294	mxge_tx_ring_t *tx;
2295	int err;
2296
2297	sc = ss->sc;
2298	ifp = sc->ifp;
2299	tx = &ss->tx;
2300
2301	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2302	    IFF_DRV_RUNNING) {
2303		err = drbr_enqueue(ifp, tx->br, m);
2304		return (err);
2305	}
2306
2307	if (!drbr_needs_enqueue(ifp, tx->br) &&
2308	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2309		/* let BPF see it */
2310		BPF_MTAP(ifp, m);
2311		/* give it to the nic */
2312		mxge_encap(ss, m);
2313	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2314		return (err);
2315	}
2316	if (!drbr_empty(ifp, tx->br))
2317		mxge_start_locked(ss);
2318	return (0);
2319}
2320
2321static int
2322mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2323{
2324	mxge_softc_t *sc = ifp->if_softc;
2325	struct mxge_slice_state *ss;
2326	mxge_tx_ring_t *tx;
2327	int err = 0;
2328	int slice;
2329
2330	slice = m->m_pkthdr.flowid;
2331	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2332
2333	ss = &sc->ss[slice];
2334	tx = &ss->tx;
2335
2336	if (mtx_trylock(&tx->mtx)) {
2337		err = mxge_transmit_locked(ss, m);
2338		mtx_unlock(&tx->mtx);
2339	} else {
2340		err = drbr_enqueue(ifp, tx->br, m);
2341	}
2342
2343	return (err);
2344}
2345
2346#else
2347
2348static inline void
2349mxge_start_locked(struct mxge_slice_state *ss)
2350{
2351	mxge_softc_t *sc;
2352	struct mbuf *m;
2353	struct ifnet *ifp;
2354	mxge_tx_ring_t *tx;
2355
2356	sc = ss->sc;
2357	ifp = sc->ifp;
2358	tx = &ss->tx;
2359	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2360		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2361		if (m == NULL) {
2362			return;
2363		}
2364		/* let BPF see it */
2365		BPF_MTAP(ifp, m);
2366
2367		/* give it to the nic */
2368		mxge_encap(ss, m);
2369	}
2370	/* ran out of transmit slots */
2371	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2372		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2373		tx->stall++;
2374	}
2375}
2376#endif
2377static void
2378mxge_start(struct ifnet *ifp)
2379{
2380	mxge_softc_t *sc = ifp->if_softc;
2381	struct mxge_slice_state *ss;
2382
2383	/* only use the first slice for now */
2384	ss = &sc->ss[0];
2385	mtx_lock(&ss->tx.mtx);
2386	mxge_start_locked(ss);
2387	mtx_unlock(&ss->tx.mtx);
2388}
2389
2390/*
2391 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2392 * at most 32 bytes at a time, so as to avoid involving the software
2393 * pio handler in the nic.   We re-write the first segment's low
2394 * DMA address to mark it valid only after we write the entire chunk
2395 * in a burst
2396 */
2397static inline void
2398mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2399		mcp_kreq_ether_recv_t *src)
2400{
2401	uint32_t low;
2402
2403	low = src->addr_low;
2404	src->addr_low = 0xffffffff;
2405	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2406	wmb();
2407	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2408	wmb();
2409	src->addr_low = low;
2410	dst->addr_low = low;
2411	wmb();
2412}
2413
2414static int
2415mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2416{
2417	bus_dma_segment_t seg;
2418	struct mbuf *m;
2419	mxge_rx_ring_t *rx = &ss->rx_small;
2420	int cnt, err;
2421
2422	m = m_gethdr(M_NOWAIT, MT_DATA);
2423	if (m == NULL) {
2424		rx->alloc_fail++;
2425		err = ENOBUFS;
2426		goto done;
2427	}
2428	m->m_len = MHLEN;
2429	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2430				      &seg, &cnt, BUS_DMA_NOWAIT);
2431	if (err != 0) {
2432		m_free(m);
2433		goto done;
2434	}
2435	rx->info[idx].m = m;
2436	rx->shadow[idx].addr_low =
2437		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2438	rx->shadow[idx].addr_high =
2439		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2440
2441done:
2442	if ((idx & 7) == 7)
2443		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2444	return err;
2445}
2446
2447static int
2448mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2449{
2450	bus_dma_segment_t seg[3];
2451	struct mbuf *m;
2452	mxge_rx_ring_t *rx = &ss->rx_big;
2453	int cnt, err, i;
2454
2455	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2456	if (m == NULL) {
2457		rx->alloc_fail++;
2458		err = ENOBUFS;
2459		goto done;
2460	}
2461	m->m_len = rx->mlen;
2462	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2463				      seg, &cnt, BUS_DMA_NOWAIT);
2464	if (err != 0) {
2465		m_free(m);
2466		goto done;
2467	}
2468	rx->info[idx].m = m;
2469	rx->shadow[idx].addr_low =
2470		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2471	rx->shadow[idx].addr_high =
2472		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2473
2474#if MXGE_VIRT_JUMBOS
2475	for (i = 1; i < cnt; i++) {
2476		rx->shadow[idx + i].addr_low =
2477			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2478		rx->shadow[idx + i].addr_high =
2479			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2480       }
2481#endif
2482
2483done:
2484       for (i = 0; i < rx->nbufs; i++) {
2485		if ((idx & 7) == 7) {
2486			mxge_submit_8rx(&rx->lanai[idx - 7],
2487					&rx->shadow[idx - 7]);
2488		}
2489		idx++;
2490	}
2491	return err;
2492}
2493
2494#ifdef INET6
2495
2496static uint16_t
2497mxge_csum_generic(uint16_t *raw, int len)
2498{
2499	uint32_t csum;
2500
2501
2502	csum = 0;
2503	while (len > 0) {
2504		csum += *raw;
2505		raw++;
2506		len -= 2;
2507	}
2508	csum = (csum >> 16) + (csum & 0xffff);
2509	csum = (csum >> 16) + (csum & 0xffff);
2510	return (uint16_t)csum;
2511}
2512
2513static inline uint16_t
2514mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2515{
2516	uint32_t partial;
2517	int nxt, cksum_offset;
2518	struct ip6_hdr *ip6 = p;
2519	uint16_t c;
2520
2521	nxt = ip6->ip6_nxt;
2522	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2523	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2524		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2525					   IPPROTO_IPV6, &nxt);
2526		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2527			return (1);
2528	}
2529
2530	/*
2531	 * IPv6 headers do not contain a checksum, and hence
2532	 * do not checksum to zero, so they don't "fall out"
2533	 * of the partial checksum calculation like IPv4
2534	 * headers do.  We need to fix the partial checksum by
2535	 * subtracting the checksum of the IPv6 header.
2536	 */
2537
2538	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2539				    ETHER_HDR_LEN);
2540	csum += ~partial;
2541	csum +=	 (csum < ~partial);
2542	csum = (csum >> 16) + (csum & 0xFFFF);
2543	csum = (csum >> 16) + (csum & 0xFFFF);
2544	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2545			     csum);
2546	c ^= 0xffff;
2547	return (c);
2548}
2549#endif /* INET6 */
2550/*
2551 *  Myri10GE hardware checksums are not valid if the sender
2552 *  padded the frame with non-zero padding.  This is because
2553 *  the firmware just does a simple 16-bit 1s complement
2554 *  checksum across the entire frame, excluding the first 14
2555 *  bytes.  It is best to simply to check the checksum and
2556 *  tell the stack about it only if the checksum is good
2557 */
2558
2559static inline uint16_t
2560mxge_rx_csum(struct mbuf *m, int csum)
2561{
2562	struct ether_header *eh;
2563#ifdef INET
2564	struct ip *ip;
2565#endif
2566#if defined(INET) || defined(INET6)
2567	int cap = m->m_pkthdr.rcvif->if_capenable;
2568#endif
2569	uint16_t c, etype;
2570
2571
2572	eh = mtod(m, struct ether_header *);
2573	etype = ntohs(eh->ether_type);
2574	switch (etype) {
2575#ifdef INET
2576	case ETHERTYPE_IP:
2577		if ((cap & IFCAP_RXCSUM) == 0)
2578			return (1);
2579		ip = (struct ip *)(eh + 1);
2580		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2581			return (1);
2582		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2583			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2584				    (ip->ip_hl << 2) + ip->ip_p));
2585		c ^= 0xffff;
2586		break;
2587#endif
2588#ifdef INET6
2589	case ETHERTYPE_IPV6:
2590		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2591			return (1);
2592		c = mxge_rx_csum6((eh + 1), m, csum);
2593		break;
2594#endif
2595	default:
2596		c = 1;
2597	}
2598	return (c);
2599}
2600
2601static void
2602mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2603{
2604	struct ether_vlan_header *evl;
2605	struct ether_header *eh;
2606	uint32_t partial;
2607
2608	evl = mtod(m, struct ether_vlan_header *);
2609	eh = mtod(m, struct ether_header *);
2610
2611	/*
2612	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2613	 * after what the firmware thought was the end of the ethernet
2614	 * header.
2615	 */
2616
2617	/* put checksum into host byte order */
2618	*csum = ntohs(*csum);
2619	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2620	(*csum) += ~partial;
2621	(*csum) +=  ((*csum) < ~partial);
2622	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2623	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624
2625	/* restore checksum to network byte order;
2626	   later consumers expect this */
2627	*csum = htons(*csum);
2628
2629	/* save the tag */
2630#ifdef MXGE_NEW_VLAN_API
2631	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2632#else
2633	{
2634		struct m_tag *mtag;
2635		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2636				   M_NOWAIT);
2637		if (mtag == NULL)
2638			return;
2639		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2640		m_tag_prepend(m, mtag);
2641	}
2642
2643#endif
2644	m->m_flags |= M_VLANTAG;
2645
2646	/*
2647	 * Remove the 802.1q header by copying the Ethernet
2648	 * addresses over it and adjusting the beginning of
2649	 * the data in the mbuf.  The encapsulated Ethernet
2650	 * type field is already in place.
2651	 */
2652	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2653	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2654	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2655}
2656
2657
2658static inline void
2659mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2660		 uint32_t csum, int lro)
2661{
2662	mxge_softc_t *sc;
2663	struct ifnet *ifp;
2664	struct mbuf *m;
2665	struct ether_header *eh;
2666	mxge_rx_ring_t *rx;
2667	bus_dmamap_t old_map;
2668	int idx;
2669
2670	sc = ss->sc;
2671	ifp = sc->ifp;
2672	rx = &ss->rx_big;
2673	idx = rx->cnt & rx->mask;
2674	rx->cnt += rx->nbufs;
2675	/* save a pointer to the received mbuf */
2676	m = rx->info[idx].m;
2677	/* try to replace the received mbuf */
2678	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2679		/* drop the frame -- the old mbuf is re-cycled */
2680		ifp->if_ierrors++;
2681		return;
2682	}
2683
2684	/* unmap the received buffer */
2685	old_map = rx->info[idx].map;
2686	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2687	bus_dmamap_unload(rx->dmat, old_map);
2688
2689	/* swap the bus_dmamap_t's */
2690	rx->info[idx].map = rx->extra_map;
2691	rx->extra_map = old_map;
2692
2693	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2694	 * aligned */
2695	m->m_data += MXGEFW_PAD;
2696
2697	m->m_pkthdr.rcvif = ifp;
2698	m->m_len = m->m_pkthdr.len = len;
2699	ss->ipackets++;
2700	eh = mtod(m, struct ether_header *);
2701	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2702		mxge_vlan_tag_remove(m, &csum);
2703	}
2704	/* flowid only valid if RSS hashing is enabled */
2705	if (sc->num_slices > 1) {
2706		m->m_pkthdr.flowid = (ss - sc->ss);
2707		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2708	}
2709	/* if the checksum is valid, mark it in the mbuf header */
2710	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2711	    (0 == mxge_rx_csum(m, csum))) {
2712		/* Tell the stack that the  checksum is good */
2713		m->m_pkthdr.csum_data = 0xffff;
2714		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2715			CSUM_DATA_VALID;
2716
2717#if defined(INET) || defined (INET6)
2718		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2719			return;
2720#endif
2721	}
2722	/* pass the frame up the stack */
2723	(*ifp->if_input)(ifp, m);
2724}
2725
2726static inline void
2727mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2728		   uint32_t csum, int lro)
2729{
2730	mxge_softc_t *sc;
2731	struct ifnet *ifp;
2732	struct ether_header *eh;
2733	struct mbuf *m;
2734	mxge_rx_ring_t *rx;
2735	bus_dmamap_t old_map;
2736	int idx;
2737
2738	sc = ss->sc;
2739	ifp = sc->ifp;
2740	rx = &ss->rx_small;
2741	idx = rx->cnt & rx->mask;
2742	rx->cnt++;
2743	/* save a pointer to the received mbuf */
2744	m = rx->info[idx].m;
2745	/* try to replace the received mbuf */
2746	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2747		/* drop the frame -- the old mbuf is re-cycled */
2748		ifp->if_ierrors++;
2749		return;
2750	}
2751
2752	/* unmap the received buffer */
2753	old_map = rx->info[idx].map;
2754	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2755	bus_dmamap_unload(rx->dmat, old_map);
2756
2757	/* swap the bus_dmamap_t's */
2758	rx->info[idx].map = rx->extra_map;
2759	rx->extra_map = old_map;
2760
2761	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2762	 * aligned */
2763	m->m_data += MXGEFW_PAD;
2764
2765	m->m_pkthdr.rcvif = ifp;
2766	m->m_len = m->m_pkthdr.len = len;
2767	ss->ipackets++;
2768	eh = mtod(m, struct ether_header *);
2769	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2770		mxge_vlan_tag_remove(m, &csum);
2771	}
2772	/* flowid only valid if RSS hashing is enabled */
2773	if (sc->num_slices > 1) {
2774		m->m_pkthdr.flowid = (ss - sc->ss);
2775		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2776	}
2777	/* if the checksum is valid, mark it in the mbuf header */
2778	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2779	    (0 == mxge_rx_csum(m, csum))) {
2780		/* Tell the stack that the  checksum is good */
2781		m->m_pkthdr.csum_data = 0xffff;
2782		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2783			CSUM_DATA_VALID;
2784
2785#if defined(INET) || defined (INET6)
2786		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2787			return;
2788#endif
2789	}
2790	/* pass the frame up the stack */
2791	(*ifp->if_input)(ifp, m);
2792}
2793
2794static inline void
2795mxge_clean_rx_done(struct mxge_slice_state *ss)
2796{
2797	mxge_rx_done_t *rx_done = &ss->rx_done;
2798	int limit = 0;
2799	uint16_t length;
2800	uint16_t checksum;
2801	int lro;
2802
2803	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2804	while (rx_done->entry[rx_done->idx].length != 0) {
2805		length = ntohs(rx_done->entry[rx_done->idx].length);
2806		rx_done->entry[rx_done->idx].length = 0;
2807		checksum = rx_done->entry[rx_done->idx].checksum;
2808		if (length <= (MHLEN - MXGEFW_PAD))
2809			mxge_rx_done_small(ss, length, checksum, lro);
2810		else
2811			mxge_rx_done_big(ss, length, checksum, lro);
2812		rx_done->cnt++;
2813		rx_done->idx = rx_done->cnt & rx_done->mask;
2814
2815		/* limit potential for livelock */
2816		if (__predict_false(++limit > rx_done->mask / 2))
2817			break;
2818	}
2819#if defined(INET)  || defined (INET6)
2820	while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2821		struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2822		SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2823		tcp_lro_flush(&ss->lc, lro);
2824	}
2825#endif
2826}
2827
2828
2829static inline void
2830mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2831{
2832	struct ifnet *ifp;
2833	mxge_tx_ring_t *tx;
2834	struct mbuf *m;
2835	bus_dmamap_t map;
2836	int idx;
2837	int *flags;
2838
2839	tx = &ss->tx;
2840	ifp = ss->sc->ifp;
2841	while (tx->pkt_done != mcp_idx) {
2842		idx = tx->done & tx->mask;
2843		tx->done++;
2844		m = tx->info[idx].m;
2845		/* mbuf and DMA map only attached to the first
2846		   segment per-mbuf */
2847		if (m != NULL) {
2848			ss->obytes += m->m_pkthdr.len;
2849			if (m->m_flags & M_MCAST)
2850				ss->omcasts++;
2851			ss->opackets++;
2852			tx->info[idx].m = NULL;
2853			map = tx->info[idx].map;
2854			bus_dmamap_unload(tx->dmat, map);
2855			m_freem(m);
2856		}
2857		if (tx->info[idx].flag) {
2858			tx->info[idx].flag = 0;
2859			tx->pkt_done++;
2860		}
2861	}
2862
2863	/* If we have space, clear IFF_OACTIVE to tell the stack that
2864           its OK to send packets */
2865#ifdef IFNET_BUF_RING
2866	flags = &ss->if_drv_flags;
2867#else
2868	flags = &ifp->if_drv_flags;
2869#endif
2870	mtx_lock(&ss->tx.mtx);
2871	if ((*flags) & IFF_DRV_OACTIVE &&
2872	    tx->req - tx->done < (tx->mask + 1)/4) {
2873		*(flags) &= ~IFF_DRV_OACTIVE;
2874		ss->tx.wake++;
2875		mxge_start_locked(ss);
2876	}
2877#ifdef IFNET_BUF_RING
2878	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2879		/* let the NIC stop polling this queue, since there
2880		 * are no more transmits pending */
2881		if (tx->req == tx->done) {
2882			*tx->send_stop = 1;
2883			tx->queue_active = 0;
2884			tx->deactivate++;
2885			wmb();
2886		}
2887	}
2888#endif
2889	mtx_unlock(&ss->tx.mtx);
2890
2891}
2892
2893static struct mxge_media_type mxge_xfp_media_types[] =
2894{
2895	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2896	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2897	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2898	{0,		(1 << 5),	"10GBASE-ER"},
2899	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2900	{0,		(1 << 3),	"10GBASE-SW"},
2901	{0,		(1 << 2),	"10GBASE-LW"},
2902	{0,		(1 << 1),	"10GBASE-EW"},
2903	{0,		(1 << 0),	"Reserved"}
2904};
2905static struct mxge_media_type mxge_sfp_media_types[] =
2906{
2907	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2908	{0,		(1 << 7),	"Reserved"},
2909	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2910	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2911	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2912	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2913};
2914
2915static void
2916mxge_media_set(mxge_softc_t *sc, int media_type)
2917{
2918
2919
2920	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2921		    0, NULL);
2922	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2923	sc->current_media = media_type;
2924	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2925}
2926
2927static void
2928mxge_media_init(mxge_softc_t *sc)
2929{
2930	char *ptr;
2931	int i;
2932
2933	ifmedia_removeall(&sc->media);
2934	mxge_media_set(sc, IFM_AUTO);
2935
2936	/*
2937	 * parse the product code to deterimine the interface type
2938	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2939	 * after the 3rd dash in the driver's cached copy of the
2940	 * EEPROM's product code string.
2941	 */
2942	ptr = sc->product_code_string;
2943	if (ptr == NULL) {
2944		device_printf(sc->dev, "Missing product code\n");
2945		return;
2946	}
2947
2948	for (i = 0; i < 3; i++, ptr++) {
2949		ptr = strchr(ptr, '-');
2950		if (ptr == NULL) {
2951			device_printf(sc->dev,
2952				      "only %d dashes in PC?!?\n", i);
2953			return;
2954		}
2955	}
2956	if (*ptr == 'C' || *(ptr +1) == 'C') {
2957		/* -C is CX4 */
2958		sc->connector = MXGE_CX4;
2959		mxge_media_set(sc, IFM_10G_CX4);
2960	} else if (*ptr == 'Q') {
2961		/* -Q is Quad Ribbon Fiber */
2962		sc->connector = MXGE_QRF;
2963		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2964		/* FreeBSD has no media type for Quad ribbon fiber */
2965	} else if (*ptr == 'R') {
2966		/* -R is XFP */
2967		sc->connector = MXGE_XFP;
2968	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2969		/* -S or -2S is SFP+ */
2970		sc->connector = MXGE_SFP;
2971	} else {
2972		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2973	}
2974}
2975
2976/*
2977 * Determine the media type for a NIC.  Some XFPs will identify
2978 * themselves only when their link is up, so this is initiated via a
2979 * link up interrupt.  However, this can potentially take up to
2980 * several milliseconds, so it is run via the watchdog routine, rather
2981 * than in the interrupt handler itself.
2982 */
2983static void
2984mxge_media_probe(mxge_softc_t *sc)
2985{
2986	mxge_cmd_t cmd;
2987	char *cage_type;
2988
2989	struct mxge_media_type *mxge_media_types = NULL;
2990	int i, err, ms, mxge_media_type_entries;
2991	uint32_t byte;
2992
2993	sc->need_media_probe = 0;
2994
2995	if (sc->connector == MXGE_XFP) {
2996		/* -R is XFP */
2997		mxge_media_types = mxge_xfp_media_types;
2998		mxge_media_type_entries =
2999			sizeof (mxge_xfp_media_types) /
3000			sizeof (mxge_xfp_media_types[0]);
3001		byte = MXGE_XFP_COMPLIANCE_BYTE;
3002		cage_type = "XFP";
3003	} else 	if (sc->connector == MXGE_SFP) {
3004		/* -S or -2S is SFP+ */
3005		mxge_media_types = mxge_sfp_media_types;
3006		mxge_media_type_entries =
3007			sizeof (mxge_sfp_media_types) /
3008			sizeof (mxge_sfp_media_types[0]);
3009		cage_type = "SFP+";
3010		byte = 3;
3011	} else {
3012		/* nothing to do; media type cannot change */
3013		return;
3014	}
3015
3016	/*
3017	 * At this point we know the NIC has an XFP cage, so now we
3018	 * try to determine what is in the cage by using the
3019	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
3020	 * register.  We read just one byte, which may take over
3021	 * a millisecond
3022	 */
3023
3024	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3025	cmd.data1 = byte;
3026	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3027	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3028		device_printf(sc->dev, "failed to read XFP\n");
3029	}
3030	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3031		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3032	}
3033	if (err != MXGEFW_CMD_OK) {
3034		return;
3035	}
3036
3037	/* now we wait for the data to be cached */
3038	cmd.data0 = byte;
3039	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3040	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3041		DELAY(1000);
3042		cmd.data0 = byte;
3043		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3044	}
3045	if (err != MXGEFW_CMD_OK) {
3046		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3047			      cage_type, err, ms);
3048		return;
3049	}
3050
3051	if (cmd.data0 == mxge_media_types[0].bitmask) {
3052		if (mxge_verbose)
3053			device_printf(sc->dev, "%s:%s\n", cage_type,
3054				      mxge_media_types[0].name);
3055		if (sc->current_media != mxge_media_types[0].flag) {
3056			mxge_media_init(sc);
3057			mxge_media_set(sc, mxge_media_types[0].flag);
3058		}
3059		return;
3060	}
3061	for (i = 1; i < mxge_media_type_entries; i++) {
3062		if (cmd.data0 & mxge_media_types[i].bitmask) {
3063			if (mxge_verbose)
3064				device_printf(sc->dev, "%s:%s\n",
3065					      cage_type,
3066					      mxge_media_types[i].name);
3067
3068			if (sc->current_media != mxge_media_types[i].flag) {
3069				mxge_media_init(sc);
3070				mxge_media_set(sc, mxge_media_types[i].flag);
3071			}
3072			return;
3073		}
3074	}
3075	if (mxge_verbose)
3076		device_printf(sc->dev, "%s media 0x%x unknown\n",
3077			      cage_type, cmd.data0);
3078
3079	return;
3080}
3081
3082static void
3083mxge_intr(void *arg)
3084{
3085	struct mxge_slice_state *ss = arg;
3086	mxge_softc_t *sc = ss->sc;
3087	mcp_irq_data_t *stats = ss->fw_stats;
3088	mxge_tx_ring_t *tx = &ss->tx;
3089	mxge_rx_done_t *rx_done = &ss->rx_done;
3090	uint32_t send_done_count;
3091	uint8_t valid;
3092
3093
3094#ifndef IFNET_BUF_RING
3095	/* an interrupt on a non-zero slice is implicitly valid
3096	   since MSI-X irqs are not shared */
3097	if (ss != sc->ss) {
3098		mxge_clean_rx_done(ss);
3099		*ss->irq_claim = be32toh(3);
3100		return;
3101	}
3102#endif
3103
3104	/* make sure the DMA has finished */
3105	if (!stats->valid) {
3106		return;
3107	}
3108	valid = stats->valid;
3109
3110	if (sc->legacy_irq) {
3111		/* lower legacy IRQ  */
3112		*sc->irq_deassert = 0;
3113		if (!mxge_deassert_wait)
3114			/* don't wait for conf. that irq is low */
3115			stats->valid = 0;
3116	} else {
3117		stats->valid = 0;
3118	}
3119
3120	/* loop while waiting for legacy irq deassertion */
3121	do {
3122		/* check for transmit completes and receives */
3123		send_done_count = be32toh(stats->send_done_count);
3124		while ((send_done_count != tx->pkt_done) ||
3125		       (rx_done->entry[rx_done->idx].length != 0)) {
3126			if (send_done_count != tx->pkt_done)
3127				mxge_tx_done(ss, (int)send_done_count);
3128			mxge_clean_rx_done(ss);
3129			send_done_count = be32toh(stats->send_done_count);
3130		}
3131		if (sc->legacy_irq && mxge_deassert_wait)
3132			wmb();
3133	} while (*((volatile uint8_t *) &stats->valid));
3134
3135	/* fw link & error stats meaningful only on the first slice */
3136	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3137		if (sc->link_state != stats->link_up) {
3138			sc->link_state = stats->link_up;
3139			if (sc->link_state) {
3140				if_link_state_change(sc->ifp, LINK_STATE_UP);
3141				if_initbaudrate(sc->ifp, IF_Gbps(10));
3142				if (mxge_verbose)
3143					device_printf(sc->dev, "link up\n");
3144			} else {
3145				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3146				sc->ifp->if_baudrate = 0;
3147				if (mxge_verbose)
3148					device_printf(sc->dev, "link down\n");
3149			}
3150			sc->need_media_probe = 1;
3151		}
3152		if (sc->rdma_tags_available !=
3153		    be32toh(stats->rdma_tags_available)) {
3154			sc->rdma_tags_available =
3155				be32toh(stats->rdma_tags_available);
3156			device_printf(sc->dev, "RDMA timed out! %d tags "
3157				      "left\n", sc->rdma_tags_available);
3158		}
3159
3160		if (stats->link_down) {
3161			sc->down_cnt += stats->link_down;
3162			sc->link_state = 0;
3163			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3164		}
3165	}
3166
3167	/* check to see if we have rx token to pass back */
3168	if (valid & 0x1)
3169	    *ss->irq_claim = be32toh(3);
3170	*(ss->irq_claim + 1) = be32toh(3);
3171}
3172
3173static void
3174mxge_init(void *arg)
3175{
3176	mxge_softc_t *sc = arg;
3177	struct ifnet *ifp = sc->ifp;
3178
3179
3180	mtx_lock(&sc->driver_mtx);
3181	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3182		(void) mxge_open(sc);
3183	mtx_unlock(&sc->driver_mtx);
3184}
3185
3186
3187
3188static void
3189mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3190{
3191	int i;
3192
3193#if defined(INET) || defined(INET6)
3194	tcp_lro_free(&ss->lc);
3195#endif
3196	for (i = 0; i <= ss->rx_big.mask; i++) {
3197		if (ss->rx_big.info[i].m == NULL)
3198			continue;
3199		bus_dmamap_unload(ss->rx_big.dmat,
3200				  ss->rx_big.info[i].map);
3201		m_freem(ss->rx_big.info[i].m);
3202		ss->rx_big.info[i].m = NULL;
3203	}
3204
3205	for (i = 0; i <= ss->rx_small.mask; i++) {
3206		if (ss->rx_small.info[i].m == NULL)
3207			continue;
3208		bus_dmamap_unload(ss->rx_small.dmat,
3209				  ss->rx_small.info[i].map);
3210		m_freem(ss->rx_small.info[i].m);
3211		ss->rx_small.info[i].m = NULL;
3212	}
3213
3214	/* transmit ring used only on the first slice */
3215	if (ss->tx.info == NULL)
3216		return;
3217
3218	for (i = 0; i <= ss->tx.mask; i++) {
3219		ss->tx.info[i].flag = 0;
3220		if (ss->tx.info[i].m == NULL)
3221			continue;
3222		bus_dmamap_unload(ss->tx.dmat,
3223				  ss->tx.info[i].map);
3224		m_freem(ss->tx.info[i].m);
3225		ss->tx.info[i].m = NULL;
3226	}
3227}
3228
3229static void
3230mxge_free_mbufs(mxge_softc_t *sc)
3231{
3232	int slice;
3233
3234	for (slice = 0; slice < sc->num_slices; slice++)
3235		mxge_free_slice_mbufs(&sc->ss[slice]);
3236}
3237
3238static void
3239mxge_free_slice_rings(struct mxge_slice_state *ss)
3240{
3241	int i;
3242
3243
3244	if (ss->rx_done.entry != NULL)
3245		mxge_dma_free(&ss->rx_done.dma);
3246	ss->rx_done.entry = NULL;
3247
3248	if (ss->tx.req_bytes != NULL)
3249		free(ss->tx.req_bytes, M_DEVBUF);
3250	ss->tx.req_bytes = NULL;
3251
3252	if (ss->tx.seg_list != NULL)
3253		free(ss->tx.seg_list, M_DEVBUF);
3254	ss->tx.seg_list = NULL;
3255
3256	if (ss->rx_small.shadow != NULL)
3257		free(ss->rx_small.shadow, M_DEVBUF);
3258	ss->rx_small.shadow = NULL;
3259
3260	if (ss->rx_big.shadow != NULL)
3261		free(ss->rx_big.shadow, M_DEVBUF);
3262	ss->rx_big.shadow = NULL;
3263
3264	if (ss->tx.info != NULL) {
3265		if (ss->tx.dmat != NULL) {
3266			for (i = 0; i <= ss->tx.mask; i++) {
3267				bus_dmamap_destroy(ss->tx.dmat,
3268						   ss->tx.info[i].map);
3269			}
3270			bus_dma_tag_destroy(ss->tx.dmat);
3271		}
3272		free(ss->tx.info, M_DEVBUF);
3273	}
3274	ss->tx.info = NULL;
3275
3276	if (ss->rx_small.info != NULL) {
3277		if (ss->rx_small.dmat != NULL) {
3278			for (i = 0; i <= ss->rx_small.mask; i++) {
3279				bus_dmamap_destroy(ss->rx_small.dmat,
3280						   ss->rx_small.info[i].map);
3281			}
3282			bus_dmamap_destroy(ss->rx_small.dmat,
3283					   ss->rx_small.extra_map);
3284			bus_dma_tag_destroy(ss->rx_small.dmat);
3285		}
3286		free(ss->rx_small.info, M_DEVBUF);
3287	}
3288	ss->rx_small.info = NULL;
3289
3290	if (ss->rx_big.info != NULL) {
3291		if (ss->rx_big.dmat != NULL) {
3292			for (i = 0; i <= ss->rx_big.mask; i++) {
3293				bus_dmamap_destroy(ss->rx_big.dmat,
3294						   ss->rx_big.info[i].map);
3295			}
3296			bus_dmamap_destroy(ss->rx_big.dmat,
3297					   ss->rx_big.extra_map);
3298			bus_dma_tag_destroy(ss->rx_big.dmat);
3299		}
3300		free(ss->rx_big.info, M_DEVBUF);
3301	}
3302	ss->rx_big.info = NULL;
3303}
3304
3305static void
3306mxge_free_rings(mxge_softc_t *sc)
3307{
3308	int slice;
3309
3310	for (slice = 0; slice < sc->num_slices; slice++)
3311		mxge_free_slice_rings(&sc->ss[slice]);
3312}
3313
3314static int
3315mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3316		       int tx_ring_entries)
3317{
3318	mxge_softc_t *sc = ss->sc;
3319	size_t bytes;
3320	int err, i;
3321
3322	/* allocate per-slice receive resources */
3323
3324	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3325	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3326
3327	/* allocate the rx shadow rings */
3328	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3329	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3330
3331	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3332	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3333
3334	/* allocate the rx host info rings */
3335	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3336	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3337
3338	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3339	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3340
3341	/* allocate the rx busdma resources */
3342	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3343				 1,			/* alignment */
3344				 4096,			/* boundary */
3345				 BUS_SPACE_MAXADDR,	/* low */
3346				 BUS_SPACE_MAXADDR,	/* high */
3347				 NULL, NULL,		/* filter */
3348				 MHLEN,			/* maxsize */
3349				 1,			/* num segs */
3350				 MHLEN,			/* maxsegsize */
3351				 BUS_DMA_ALLOCNOW,	/* flags */
3352				 NULL, NULL,		/* lock */
3353				 &ss->rx_small.dmat);	/* tag */
3354	if (err != 0) {
3355		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3356			      err);
3357		return err;
3358	}
3359
3360	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3361				 1,			/* alignment */
3362#if MXGE_VIRT_JUMBOS
3363				 4096,			/* boundary */
3364#else
3365				 0,			/* boundary */
3366#endif
3367				 BUS_SPACE_MAXADDR,	/* low */
3368				 BUS_SPACE_MAXADDR,	/* high */
3369				 NULL, NULL,		/* filter */
3370				 3*4096,		/* maxsize */
3371#if MXGE_VIRT_JUMBOS
3372				 3,			/* num segs */
3373				 4096,			/* maxsegsize*/
3374#else
3375				 1,			/* num segs */
3376				 MJUM9BYTES,		/* maxsegsize*/
3377#endif
3378				 BUS_DMA_ALLOCNOW,	/* flags */
3379				 NULL, NULL,		/* lock */
3380				 &ss->rx_big.dmat);	/* tag */
3381	if (err != 0) {
3382		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3383			      err);
3384		return err;
3385	}
3386	for (i = 0; i <= ss->rx_small.mask; i++) {
3387		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3388					&ss->rx_small.info[i].map);
3389		if (err != 0) {
3390			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3391				      err);
3392			return err;
3393		}
3394	}
3395	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3396				&ss->rx_small.extra_map);
3397	if (err != 0) {
3398		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3399			      err);
3400		return err;
3401	}
3402
3403	for (i = 0; i <= ss->rx_big.mask; i++) {
3404		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3405					&ss->rx_big.info[i].map);
3406		if (err != 0) {
3407			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3408				      err);
3409			return err;
3410		}
3411	}
3412	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3413				&ss->rx_big.extra_map);
3414	if (err != 0) {
3415		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3416			      err);
3417		return err;
3418	}
3419
3420	/* now allocate TX resources */
3421
3422#ifndef IFNET_BUF_RING
3423	/* only use a single TX ring for now */
3424	if (ss != ss->sc->ss)
3425		return 0;
3426#endif
3427
3428	ss->tx.mask = tx_ring_entries - 1;
3429	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3430
3431
3432	/* allocate the tx request copy block */
3433	bytes = 8 +
3434		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3435	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3436	/* ensure req_list entries are aligned to 8 bytes */
3437	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3438		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3439
3440	/* allocate the tx busdma segment list */
3441	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3442	ss->tx.seg_list = (bus_dma_segment_t *)
3443		malloc(bytes, M_DEVBUF, M_WAITOK);
3444
3445	/* allocate the tx host info ring */
3446	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3447	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3448
3449	/* allocate the tx busdma resources */
3450	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3451				 1,			/* alignment */
3452				 sc->tx_boundary,	/* boundary */
3453				 BUS_SPACE_MAXADDR,	/* low */
3454				 BUS_SPACE_MAXADDR,	/* high */
3455				 NULL, NULL,		/* filter */
3456				 65536 + 256,		/* maxsize */
3457				 ss->tx.max_desc - 2,	/* num segs */
3458				 sc->tx_boundary,	/* maxsegsz */
3459				 BUS_DMA_ALLOCNOW,	/* flags */
3460				 NULL, NULL,		/* lock */
3461				 &ss->tx.dmat);		/* tag */
3462
3463	if (err != 0) {
3464		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3465			      err);
3466		return err;
3467	}
3468
3469	/* now use these tags to setup dmamaps for each slot
3470	   in the ring */
3471	for (i = 0; i <= ss->tx.mask; i++) {
3472		err = bus_dmamap_create(ss->tx.dmat, 0,
3473					&ss->tx.info[i].map);
3474		if (err != 0) {
3475			device_printf(sc->dev, "Err %d  tx dmamap\n",
3476				      err);
3477			return err;
3478		}
3479	}
3480	return 0;
3481
3482}
3483
3484static int
3485mxge_alloc_rings(mxge_softc_t *sc)
3486{
3487	mxge_cmd_t cmd;
3488	int tx_ring_size;
3489	int tx_ring_entries, rx_ring_entries;
3490	int err, slice;
3491
3492	/* get ring sizes */
3493	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3494	tx_ring_size = cmd.data0;
3495	if (err != 0) {
3496		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3497		goto abort;
3498	}
3499
3500	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3501	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3502	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3503	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3504	IFQ_SET_READY(&sc->ifp->if_snd);
3505
3506	for (slice = 0; slice < sc->num_slices; slice++) {
3507		err = mxge_alloc_slice_rings(&sc->ss[slice],
3508					     rx_ring_entries,
3509					     tx_ring_entries);
3510		if (err != 0)
3511			goto abort;
3512	}
3513	return 0;
3514
3515abort:
3516	mxge_free_rings(sc);
3517	return err;
3518
3519}
3520
3521
3522static void
3523mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3524{
3525	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3526
3527	if (bufsize < MCLBYTES) {
3528		/* easy, everything fits in a single buffer */
3529		*big_buf_size = MCLBYTES;
3530		*cl_size = MCLBYTES;
3531		*nbufs = 1;
3532		return;
3533	}
3534
3535	if (bufsize < MJUMPAGESIZE) {
3536		/* still easy, everything still fits in a single buffer */
3537		*big_buf_size = MJUMPAGESIZE;
3538		*cl_size = MJUMPAGESIZE;
3539		*nbufs = 1;
3540		return;
3541	}
3542#if MXGE_VIRT_JUMBOS
3543	/* now we need to use virtually contiguous buffers */
3544	*cl_size = MJUM9BYTES;
3545	*big_buf_size = 4096;
3546	*nbufs = mtu / 4096 + 1;
3547	/* needs to be a power of two, so round up */
3548	if (*nbufs == 3)
3549		*nbufs = 4;
3550#else
3551	*cl_size = MJUM9BYTES;
3552	*big_buf_size = MJUM9BYTES;
3553	*nbufs = 1;
3554#endif
3555}
3556
3557static int
3558mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3559{
3560	mxge_softc_t *sc;
3561	mxge_cmd_t cmd;
3562	bus_dmamap_t map;
3563	int err, i, slice;
3564
3565
3566	sc = ss->sc;
3567	slice = ss - sc->ss;
3568
3569#if defined(INET) || defined(INET6)
3570	(void)tcp_lro_init(&ss->lc);
3571#endif
3572	ss->lc.ifp = sc->ifp;
3573
3574	/* get the lanai pointers to the send and receive rings */
3575
3576	err = 0;
3577#ifndef IFNET_BUF_RING
3578	/* We currently only send from the first slice */
3579	if (slice == 0) {
3580#endif
3581		cmd.data0 = slice;
3582		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3583		ss->tx.lanai =
3584			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3585		ss->tx.send_go = (volatile uint32_t *)
3586			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3587		ss->tx.send_stop = (volatile uint32_t *)
3588		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3589#ifndef IFNET_BUF_RING
3590	}
3591#endif
3592	cmd.data0 = slice;
3593	err |= mxge_send_cmd(sc,
3594			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3595	ss->rx_small.lanai =
3596		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3597	cmd.data0 = slice;
3598	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3599	ss->rx_big.lanai =
3600		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3601
3602	if (err != 0) {
3603		device_printf(sc->dev,
3604			      "failed to get ring sizes or locations\n");
3605		return EIO;
3606	}
3607
3608	/* stock receive rings */
3609	for (i = 0; i <= ss->rx_small.mask; i++) {
3610		map = ss->rx_small.info[i].map;
3611		err = mxge_get_buf_small(ss, map, i);
3612		if (err) {
3613			device_printf(sc->dev, "alloced %d/%d smalls\n",
3614				      i, ss->rx_small.mask + 1);
3615			return ENOMEM;
3616		}
3617	}
3618	for (i = 0; i <= ss->rx_big.mask; i++) {
3619		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3620		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3621	}
3622	ss->rx_big.nbufs = nbufs;
3623	ss->rx_big.cl_size = cl_size;
3624	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3625		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3626	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3627		map = ss->rx_big.info[i].map;
3628		err = mxge_get_buf_big(ss, map, i);
3629		if (err) {
3630			device_printf(sc->dev, "alloced %d/%d bigs\n",
3631				      i, ss->rx_big.mask + 1);
3632			return ENOMEM;
3633		}
3634	}
3635	return 0;
3636}
3637
3638static int
3639mxge_open(mxge_softc_t *sc)
3640{
3641	mxge_cmd_t cmd;
3642	int err, big_bytes, nbufs, slice, cl_size, i;
3643	bus_addr_t bus;
3644	volatile uint8_t *itable;
3645	struct mxge_slice_state *ss;
3646
3647	/* Copy the MAC address in case it was overridden */
3648	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3649
3650	err = mxge_reset(sc, 1);
3651	if (err != 0) {
3652		device_printf(sc->dev, "failed to reset\n");
3653		return EIO;
3654	}
3655
3656	if (sc->num_slices > 1) {
3657		/* setup the indirection table */
3658		cmd.data0 = sc->num_slices;
3659		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3660				    &cmd);
3661
3662		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3663				     &cmd);
3664		if (err != 0) {
3665			device_printf(sc->dev,
3666				      "failed to setup rss tables\n");
3667			return err;
3668		}
3669
3670		/* just enable an identity mapping */
3671		itable = sc->sram + cmd.data0;
3672		for (i = 0; i < sc->num_slices; i++)
3673			itable[i] = (uint8_t)i;
3674
3675		cmd.data0 = 1;
3676		cmd.data1 = mxge_rss_hash_type;
3677		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3678		if (err != 0) {
3679			device_printf(sc->dev, "failed to enable slices\n");
3680			return err;
3681		}
3682	}
3683
3684
3685	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3686
3687	cmd.data0 = nbufs;
3688	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3689			    &cmd);
3690	/* error is only meaningful if we're trying to set
3691	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3692	if (err && nbufs > 1) {
3693		device_printf(sc->dev,
3694			      "Failed to set alway-use-n to %d\n",
3695			      nbufs);
3696		return EIO;
3697	}
3698	/* Give the firmware the mtu and the big and small buffer
3699	   sizes.  The firmware wants the big buf size to be a power
3700	   of two. Luckily, FreeBSD's clusters are powers of two */
3701	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3702	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3703	cmd.data0 = MHLEN - MXGEFW_PAD;
3704	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3705			     &cmd);
3706	cmd.data0 = big_bytes;
3707	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3708
3709	if (err != 0) {
3710		device_printf(sc->dev, "failed to setup params\n");
3711		goto abort;
3712	}
3713
3714	/* Now give him the pointer to the stats block */
3715	for (slice = 0;
3716#ifdef IFNET_BUF_RING
3717	     slice < sc->num_slices;
3718#else
3719	     slice < 1;
3720#endif
3721	     slice++) {
3722		ss = &sc->ss[slice];
3723		cmd.data0 =
3724			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3725		cmd.data1 =
3726			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3727		cmd.data2 = sizeof(struct mcp_irq_data);
3728		cmd.data2 |= (slice << 16);
3729		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3730	}
3731
3732	if (err != 0) {
3733		bus = sc->ss->fw_stats_dma.bus_addr;
3734		bus += offsetof(struct mcp_irq_data, send_done_count);
3735		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3736		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3737		err = mxge_send_cmd(sc,
3738				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3739				    &cmd);
3740		/* Firmware cannot support multicast without STATS_DMA_V2 */
3741		sc->fw_multicast_support = 0;
3742	} else {
3743		sc->fw_multicast_support = 1;
3744	}
3745
3746	if (err != 0) {
3747		device_printf(sc->dev, "failed to setup params\n");
3748		goto abort;
3749	}
3750
3751	for (slice = 0; slice < sc->num_slices; slice++) {
3752		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3753		if (err != 0) {
3754			device_printf(sc->dev, "couldn't open slice %d\n",
3755				      slice);
3756			goto abort;
3757		}
3758	}
3759
3760	/* Finally, start the firmware running */
3761	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3762	if (err) {
3763		device_printf(sc->dev, "Couldn't bring up link\n");
3764		goto abort;
3765	}
3766#ifdef IFNET_BUF_RING
3767	for (slice = 0; slice < sc->num_slices; slice++) {
3768		ss = &sc->ss[slice];
3769		ss->if_drv_flags |= IFF_DRV_RUNNING;
3770		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3771	}
3772#endif
3773	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3774	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3775
3776	return 0;
3777
3778
3779abort:
3780	mxge_free_mbufs(sc);
3781
3782	return err;
3783}
3784
3785static int
3786mxge_close(mxge_softc_t *sc, int down)
3787{
3788	mxge_cmd_t cmd;
3789	int err, old_down_cnt;
3790#ifdef IFNET_BUF_RING
3791	struct mxge_slice_state *ss;
3792	int slice;
3793#endif
3794
3795#ifdef IFNET_BUF_RING
3796	for (slice = 0; slice < sc->num_slices; slice++) {
3797		ss = &sc->ss[slice];
3798		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3799	}
3800#endif
3801	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3802	if (!down) {
3803		old_down_cnt = sc->down_cnt;
3804		wmb();
3805		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3806		if (err) {
3807			device_printf(sc->dev,
3808				      "Couldn't bring down link\n");
3809		}
3810		if (old_down_cnt == sc->down_cnt) {
3811			/* wait for down irq */
3812			DELAY(10 * sc->intr_coal_delay);
3813		}
3814		wmb();
3815		if (old_down_cnt == sc->down_cnt) {
3816			device_printf(sc->dev, "never got down irq\n");
3817		}
3818	}
3819	mxge_free_mbufs(sc);
3820
3821	return 0;
3822}
3823
3824static void
3825mxge_setup_cfg_space(mxge_softc_t *sc)
3826{
3827	device_t dev = sc->dev;
3828	int reg;
3829	uint16_t lnk, pectl;
3830
3831	/* find the PCIe link width and set max read request to 4KB*/
3832	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3833		lnk = pci_read_config(dev, reg + 0x12, 2);
3834		sc->link_width = (lnk >> 4) & 0x3f;
3835
3836		if (sc->pectl == 0) {
3837			pectl = pci_read_config(dev, reg + 0x8, 2);
3838			pectl = (pectl & ~0x7000) | (5 << 12);
3839			pci_write_config(dev, reg + 0x8, pectl, 2);
3840			sc->pectl = pectl;
3841		} else {
3842			/* restore saved pectl after watchdog reset */
3843			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3844		}
3845	}
3846
3847	/* Enable DMA and Memory space access */
3848	pci_enable_busmaster(dev);
3849}
3850
3851static uint32_t
3852mxge_read_reboot(mxge_softc_t *sc)
3853{
3854	device_t dev = sc->dev;
3855	uint32_t vs;
3856
3857	/* find the vendor specific offset */
3858	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3859		device_printf(sc->dev,
3860			      "could not find vendor specific offset\n");
3861		return (uint32_t)-1;
3862	}
3863	/* enable read32 mode */
3864	pci_write_config(dev, vs + 0x10, 0x3, 1);
3865	/* tell NIC which register to read */
3866	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3867	return (pci_read_config(dev, vs + 0x14, 4));
3868}
3869
3870static void
3871mxge_watchdog_reset(mxge_softc_t *sc)
3872{
3873	struct pci_devinfo *dinfo;
3874	struct mxge_slice_state *ss;
3875	int err, running, s, num_tx_slices = 1;
3876	uint32_t reboot;
3877	uint16_t cmd;
3878
3879	err = ENXIO;
3880
3881	device_printf(sc->dev, "Watchdog reset!\n");
3882
3883	/*
3884	 * check to see if the NIC rebooted.  If it did, then all of
3885	 * PCI config space has been reset, and things like the
3886	 * busmaster bit will be zero.  If this is the case, then we
3887	 * must restore PCI config space before the NIC can be used
3888	 * again
3889	 */
3890	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3891	if (cmd == 0xffff) {
3892		/*
3893		 * maybe the watchdog caught the NIC rebooting; wait
3894		 * up to 100ms for it to finish.  If it does not come
3895		 * back, then give up
3896		 */
3897		DELAY(1000*100);
3898		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3899		if (cmd == 0xffff) {
3900			device_printf(sc->dev, "NIC disappeared!\n");
3901		}
3902	}
3903	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3904		/* print the reboot status */
3905		reboot = mxge_read_reboot(sc);
3906		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3907			      reboot);
3908		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3909		if (running) {
3910
3911			/*
3912			 * quiesce NIC so that TX routines will not try to
3913			 * xmit after restoration of BAR
3914			 */
3915
3916			/* Mark the link as down */
3917			if (sc->link_state) {
3918				sc->link_state = 0;
3919				if_link_state_change(sc->ifp,
3920						     LINK_STATE_DOWN);
3921			}
3922#ifdef IFNET_BUF_RING
3923			num_tx_slices = sc->num_slices;
3924#endif
3925			/* grab all TX locks to ensure no tx  */
3926			for (s = 0; s < num_tx_slices; s++) {
3927				ss = &sc->ss[s];
3928				mtx_lock(&ss->tx.mtx);
3929			}
3930			mxge_close(sc, 1);
3931		}
3932		/* restore PCI configuration space */
3933		dinfo = device_get_ivars(sc->dev);
3934		pci_cfg_restore(sc->dev, dinfo);
3935
3936		/* and redo any changes we made to our config space */
3937		mxge_setup_cfg_space(sc);
3938
3939		/* reload f/w */
3940		err = mxge_load_firmware(sc, 0);
3941		if (err) {
3942			device_printf(sc->dev,
3943				      "Unable to re-load f/w\n");
3944		}
3945		if (running) {
3946			if (!err)
3947				err = mxge_open(sc);
3948			/* release all TX locks */
3949			for (s = 0; s < num_tx_slices; s++) {
3950				ss = &sc->ss[s];
3951#ifdef IFNET_BUF_RING
3952				mxge_start_locked(ss);
3953#endif
3954				mtx_unlock(&ss->tx.mtx);
3955			}
3956		}
3957		sc->watchdog_resets++;
3958	} else {
3959		device_printf(sc->dev,
3960			      "NIC did not reboot, not resetting\n");
3961		err = 0;
3962	}
3963	if (err) {
3964		device_printf(sc->dev, "watchdog reset failed\n");
3965	} else {
3966		if (sc->dying == 2)
3967			sc->dying = 0;
3968		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3969	}
3970}
3971
3972static void
3973mxge_watchdog_task(void *arg, int pending)
3974{
3975	mxge_softc_t *sc = arg;
3976
3977
3978	mtx_lock(&sc->driver_mtx);
3979	mxge_watchdog_reset(sc);
3980	mtx_unlock(&sc->driver_mtx);
3981}
3982
3983static void
3984mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3985{
3986	tx = &sc->ss[slice].tx;
3987	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3988	device_printf(sc->dev,
3989		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3990		      tx->req, tx->done, tx->queue_active);
3991	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3992			      tx->activate, tx->deactivate);
3993	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3994		      tx->pkt_done,
3995		      be32toh(sc->ss->fw_stats->send_done_count));
3996}
3997
3998static int
3999mxge_watchdog(mxge_softc_t *sc)
4000{
4001	mxge_tx_ring_t *tx;
4002	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
4003	int i, err = 0;
4004
4005	/* see if we have outstanding transmits, which
4006	   have been pending for more than mxge_ticks */
4007	for (i = 0;
4008#ifdef IFNET_BUF_RING
4009	     (i < sc->num_slices) && (err == 0);
4010#else
4011	     (i < 1) && (err == 0);
4012#endif
4013	     i++) {
4014		tx = &sc->ss[i].tx;
4015		if (tx->req != tx->done &&
4016		    tx->watchdog_req != tx->watchdog_done &&
4017		    tx->done == tx->watchdog_done) {
4018			/* check for pause blocking before resetting */
4019			if (tx->watchdog_rx_pause == rx_pause) {
4020				mxge_warn_stuck(sc, tx, i);
4021				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4022				return (ENXIO);
4023			}
4024			else
4025				device_printf(sc->dev, "Flow control blocking "
4026					      "xmits, check link partner\n");
4027		}
4028
4029		tx->watchdog_req = tx->req;
4030		tx->watchdog_done = tx->done;
4031		tx->watchdog_rx_pause = rx_pause;
4032	}
4033
4034	if (sc->need_media_probe)
4035		mxge_media_probe(sc);
4036	return (err);
4037}
4038
4039static u_long
4040mxge_update_stats(mxge_softc_t *sc)
4041{
4042	struct mxge_slice_state *ss;
4043	u_long pkts = 0;
4044	u_long ipackets = 0;
4045	u_long opackets = 0;
4046#ifdef IFNET_BUF_RING
4047	u_long obytes = 0;
4048	u_long omcasts = 0;
4049	u_long odrops = 0;
4050#endif
4051	u_long oerrors = 0;
4052	int slice;
4053
4054	for (slice = 0; slice < sc->num_slices; slice++) {
4055		ss = &sc->ss[slice];
4056		ipackets += ss->ipackets;
4057		opackets += ss->opackets;
4058#ifdef IFNET_BUF_RING
4059		obytes += ss->obytes;
4060		omcasts += ss->omcasts;
4061		odrops += ss->tx.br->br_drops;
4062#endif
4063		oerrors += ss->oerrors;
4064	}
4065	pkts = (ipackets - sc->ifp->if_ipackets);
4066	pkts += (opackets - sc->ifp->if_opackets);
4067	sc->ifp->if_ipackets = ipackets;
4068	sc->ifp->if_opackets = opackets;
4069#ifdef IFNET_BUF_RING
4070	sc->ifp->if_obytes = obytes;
4071	sc->ifp->if_omcasts = omcasts;
4072	sc->ifp->if_snd.ifq_drops = odrops;
4073#endif
4074	sc->ifp->if_oerrors = oerrors;
4075	return pkts;
4076}
4077
4078static void
4079mxge_tick(void *arg)
4080{
4081	mxge_softc_t *sc = arg;
4082	u_long pkts = 0;
4083	int err = 0;
4084	int running, ticks;
4085	uint16_t cmd;
4086
4087	ticks = mxge_ticks;
4088	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4089	if (running) {
4090		/* aggregate stats from different slices */
4091		pkts = mxge_update_stats(sc);
4092		if (!sc->watchdog_countdown) {
4093			err = mxge_watchdog(sc);
4094			sc->watchdog_countdown = 4;
4095		}
4096		sc->watchdog_countdown--;
4097	}
4098	if (pkts == 0) {
4099		/* ensure NIC did not suffer h/w fault while idle */
4100		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4101		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4102			sc->dying = 2;
4103			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4104			err = ENXIO;
4105		}
4106		/* look less often if NIC is idle */
4107		ticks *= 4;
4108	}
4109
4110	if (err == 0)
4111		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4112
4113}
4114
4115static int
4116mxge_media_change(struct ifnet *ifp)
4117{
4118	return EINVAL;
4119}
4120
4121static int
4122mxge_change_mtu(mxge_softc_t *sc, int mtu)
4123{
4124	struct ifnet *ifp = sc->ifp;
4125	int real_mtu, old_mtu;
4126	int err = 0;
4127
4128
4129	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4130	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4131		return EINVAL;
4132	mtx_lock(&sc->driver_mtx);
4133	old_mtu = ifp->if_mtu;
4134	ifp->if_mtu = mtu;
4135	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4136		mxge_close(sc, 0);
4137		err = mxge_open(sc);
4138		if (err != 0) {
4139			ifp->if_mtu = old_mtu;
4140			mxge_close(sc, 0);
4141			(void) mxge_open(sc);
4142		}
4143	}
4144	mtx_unlock(&sc->driver_mtx);
4145	return err;
4146}
4147
4148static void
4149mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4150{
4151	mxge_softc_t *sc = ifp->if_softc;
4152
4153
4154	if (sc == NULL)
4155		return;
4156	ifmr->ifm_status = IFM_AVALID;
4157	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4158	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4159	ifmr->ifm_active |= sc->current_media;
4160}
4161
4162static int
4163mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4164{
4165	mxge_softc_t *sc = ifp->if_softc;
4166	struct ifreq *ifr = (struct ifreq *)data;
4167	int err, mask;
4168
4169	err = 0;
4170	switch (command) {
4171	case SIOCSIFMTU:
4172		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4173		break;
4174
4175	case SIOCSIFFLAGS:
4176		mtx_lock(&sc->driver_mtx);
4177		if (sc->dying) {
4178			mtx_unlock(&sc->driver_mtx);
4179			return EINVAL;
4180		}
4181		if (ifp->if_flags & IFF_UP) {
4182			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4183				err = mxge_open(sc);
4184			} else {
4185				/* take care of promis can allmulti
4186				   flag chages */
4187				mxge_change_promisc(sc,
4188						    ifp->if_flags & IFF_PROMISC);
4189				mxge_set_multicast_list(sc);
4190			}
4191		} else {
4192			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4193				mxge_close(sc, 0);
4194			}
4195		}
4196		mtx_unlock(&sc->driver_mtx);
4197		break;
4198
4199	case SIOCADDMULTI:
4200	case SIOCDELMULTI:
4201		mtx_lock(&sc->driver_mtx);
4202		mxge_set_multicast_list(sc);
4203		mtx_unlock(&sc->driver_mtx);
4204		break;
4205
4206	case SIOCSIFCAP:
4207		mtx_lock(&sc->driver_mtx);
4208		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4209		if (mask & IFCAP_TXCSUM) {
4210			if (IFCAP_TXCSUM & ifp->if_capenable) {
4211				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4212				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4213			} else {
4214				ifp->if_capenable |= IFCAP_TXCSUM;
4215				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4216			}
4217		} else if (mask & IFCAP_RXCSUM) {
4218			if (IFCAP_RXCSUM & ifp->if_capenable) {
4219				ifp->if_capenable &= ~IFCAP_RXCSUM;
4220			} else {
4221				ifp->if_capenable |= IFCAP_RXCSUM;
4222			}
4223		}
4224		if (mask & IFCAP_TSO4) {
4225			if (IFCAP_TSO4 & ifp->if_capenable) {
4226				ifp->if_capenable &= ~IFCAP_TSO4;
4227			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4228				ifp->if_capenable |= IFCAP_TSO4;
4229				ifp->if_hwassist |= CSUM_TSO;
4230			} else {
4231				printf("mxge requires tx checksum offload"
4232				       " be enabled to use TSO\n");
4233				err = EINVAL;
4234			}
4235		}
4236#if IFCAP_TSO6
4237		if (mask & IFCAP_TXCSUM_IPV6) {
4238			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4239				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4240						       | IFCAP_TSO6);
4241				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4242						      | CSUM_UDP);
4243			} else {
4244				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4245				ifp->if_hwassist |= (CSUM_TCP_IPV6
4246						     | CSUM_UDP_IPV6);
4247			}
4248		} else if (mask & IFCAP_RXCSUM_IPV6) {
4249			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4250				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4251			} else {
4252				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4253			}
4254		}
4255		if (mask & IFCAP_TSO6) {
4256			if (IFCAP_TSO6 & ifp->if_capenable) {
4257				ifp->if_capenable &= ~IFCAP_TSO6;
4258			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4259				ifp->if_capenable |= IFCAP_TSO6;
4260				ifp->if_hwassist |= CSUM_TSO;
4261			} else {
4262				printf("mxge requires tx checksum offload"
4263				       " be enabled to use TSO\n");
4264				err = EINVAL;
4265			}
4266		}
4267#endif /*IFCAP_TSO6 */
4268
4269		if (mask & IFCAP_LRO)
4270			ifp->if_capenable ^= IFCAP_LRO;
4271		if (mask & IFCAP_VLAN_HWTAGGING)
4272			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4273		if (mask & IFCAP_VLAN_HWTSO)
4274			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4275
4276		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4277		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4278			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4279
4280		mtx_unlock(&sc->driver_mtx);
4281		VLAN_CAPABILITIES(ifp);
4282
4283		break;
4284
4285	case SIOCGIFMEDIA:
4286		mtx_lock(&sc->driver_mtx);
4287		mxge_media_probe(sc);
4288		mtx_unlock(&sc->driver_mtx);
4289		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4290				    &sc->media, command);
4291                break;
4292
4293	default:
4294		err = ether_ioctl(ifp, command, data);
4295		break;
4296        }
4297	return err;
4298}
4299
4300static void
4301mxge_fetch_tunables(mxge_softc_t *sc)
4302{
4303
4304	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4305	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4306			  &mxge_flow_control);
4307	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4308			  &mxge_intr_coal_delay);
4309	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4310			  &mxge_nvidia_ecrc_enable);
4311	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4312			  &mxge_force_firmware);
4313	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4314			  &mxge_deassert_wait);
4315	TUNABLE_INT_FETCH("hw.mxge.verbose",
4316			  &mxge_verbose);
4317	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4318	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4319	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4320	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4321	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4322	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4323
4324	if (bootverbose)
4325		mxge_verbose = 1;
4326	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4327		mxge_intr_coal_delay = 30;
4328	if (mxge_ticks == 0)
4329		mxge_ticks = hz / 2;
4330	sc->pause = mxge_flow_control;
4331	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4332	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4333		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4334	}
4335	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4336	    mxge_initial_mtu < ETHER_MIN_LEN)
4337		mxge_initial_mtu = ETHERMTU_JUMBO;
4338
4339	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4340		mxge_throttle = MXGE_MAX_THROTTLE;
4341	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4342		mxge_throttle = MXGE_MIN_THROTTLE;
4343	sc->throttle = mxge_throttle;
4344}
4345
4346
4347static void
4348mxge_free_slices(mxge_softc_t *sc)
4349{
4350	struct mxge_slice_state *ss;
4351	int i;
4352
4353
4354	if (sc->ss == NULL)
4355		return;
4356
4357	for (i = 0; i < sc->num_slices; i++) {
4358		ss = &sc->ss[i];
4359		if (ss->fw_stats != NULL) {
4360			mxge_dma_free(&ss->fw_stats_dma);
4361			ss->fw_stats = NULL;
4362#ifdef IFNET_BUF_RING
4363			if (ss->tx.br != NULL) {
4364				drbr_free(ss->tx.br, M_DEVBUF);
4365				ss->tx.br = NULL;
4366			}
4367#endif
4368			mtx_destroy(&ss->tx.mtx);
4369		}
4370		if (ss->rx_done.entry != NULL) {
4371			mxge_dma_free(&ss->rx_done.dma);
4372			ss->rx_done.entry = NULL;
4373		}
4374	}
4375	free(sc->ss, M_DEVBUF);
4376	sc->ss = NULL;
4377}
4378
4379static int
4380mxge_alloc_slices(mxge_softc_t *sc)
4381{
4382	mxge_cmd_t cmd;
4383	struct mxge_slice_state *ss;
4384	size_t bytes;
4385	int err, i, max_intr_slots;
4386
4387	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4388	if (err != 0) {
4389		device_printf(sc->dev, "Cannot determine rx ring size\n");
4390		return err;
4391	}
4392	sc->rx_ring_size = cmd.data0;
4393	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4394
4395	bytes = sizeof (*sc->ss) * sc->num_slices;
4396	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4397	if (sc->ss == NULL)
4398		return (ENOMEM);
4399	for (i = 0; i < sc->num_slices; i++) {
4400		ss = &sc->ss[i];
4401
4402		ss->sc = sc;
4403
4404		/* allocate per-slice rx interrupt queues */
4405
4406		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4407		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4408		if (err != 0)
4409			goto abort;
4410		ss->rx_done.entry = ss->rx_done.dma.addr;
4411		bzero(ss->rx_done.entry, bytes);
4412
4413		/*
4414		 * allocate the per-slice firmware stats; stats
4415		 * (including tx) are used used only on the first
4416		 * slice for now
4417		 */
4418#ifndef IFNET_BUF_RING
4419		if (i > 0)
4420			continue;
4421#endif
4422
4423		bytes = sizeof (*ss->fw_stats);
4424		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4425				     sizeof (*ss->fw_stats), 64);
4426		if (err != 0)
4427			goto abort;
4428		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4429		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4430			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4431		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4432#ifdef IFNET_BUF_RING
4433		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4434					   &ss->tx.mtx);
4435#endif
4436	}
4437
4438	return (0);
4439
4440abort:
4441	mxge_free_slices(sc);
4442	return (ENOMEM);
4443}
4444
4445static void
4446mxge_slice_probe(mxge_softc_t *sc)
4447{
4448	mxge_cmd_t cmd;
4449	char *old_fw;
4450	int msix_cnt, status, max_intr_slots;
4451
4452	sc->num_slices = 1;
4453	/*
4454	 *  don't enable multiple slices if they are not enabled,
4455	 *  or if this is not an SMP system
4456	 */
4457
4458	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4459		return;
4460
4461	/* see how many MSI-X interrupts are available */
4462	msix_cnt = pci_msix_count(sc->dev);
4463	if (msix_cnt < 2)
4464		return;
4465
4466	/* now load the slice aware firmware see what it supports */
4467	old_fw = sc->fw_name;
4468	if (old_fw == mxge_fw_aligned)
4469		sc->fw_name = mxge_fw_rss_aligned;
4470	else
4471		sc->fw_name = mxge_fw_rss_unaligned;
4472	status = mxge_load_firmware(sc, 0);
4473	if (status != 0) {
4474		device_printf(sc->dev, "Falling back to a single slice\n");
4475		return;
4476	}
4477
4478	/* try to send a reset command to the card to see if it
4479	   is alive */
4480	memset(&cmd, 0, sizeof (cmd));
4481	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4482	if (status != 0) {
4483		device_printf(sc->dev, "failed reset\n");
4484		goto abort_with_fw;
4485	}
4486
4487	/* get rx ring size */
4488	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4489	if (status != 0) {
4490		device_printf(sc->dev, "Cannot determine rx ring size\n");
4491		goto abort_with_fw;
4492	}
4493	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4494
4495	/* tell it the size of the interrupt queues */
4496	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4497	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4498	if (status != 0) {
4499		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4500		goto abort_with_fw;
4501	}
4502
4503	/* ask the maximum number of slices it supports */
4504	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4505	if (status != 0) {
4506		device_printf(sc->dev,
4507			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4508		goto abort_with_fw;
4509	}
4510	sc->num_slices = cmd.data0;
4511	if (sc->num_slices > msix_cnt)
4512		sc->num_slices = msix_cnt;
4513
4514	if (mxge_max_slices == -1) {
4515		/* cap to number of CPUs in system */
4516		if (sc->num_slices > mp_ncpus)
4517			sc->num_slices = mp_ncpus;
4518	} else {
4519		if (sc->num_slices > mxge_max_slices)
4520			sc->num_slices = mxge_max_slices;
4521	}
4522	/* make sure it is a power of two */
4523	while (sc->num_slices & (sc->num_slices - 1))
4524		sc->num_slices--;
4525
4526	if (mxge_verbose)
4527		device_printf(sc->dev, "using %d slices\n",
4528			      sc->num_slices);
4529
4530	return;
4531
4532abort_with_fw:
4533	sc->fw_name = old_fw;
4534	(void) mxge_load_firmware(sc, 0);
4535}
4536
4537static int
4538mxge_add_msix_irqs(mxge_softc_t *sc)
4539{
4540	size_t bytes;
4541	int count, err, i, rid;
4542
4543	rid = PCIR_BAR(2);
4544	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4545						    &rid, RF_ACTIVE);
4546
4547	if (sc->msix_table_res == NULL) {
4548		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4549		return ENXIO;
4550	}
4551
4552	count = sc->num_slices;
4553	err = pci_alloc_msix(sc->dev, &count);
4554	if (err != 0) {
4555		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4556			      "err = %d \n", sc->num_slices, err);
4557		goto abort_with_msix_table;
4558	}
4559	if (count < sc->num_slices) {
4560		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4561			      count, sc->num_slices);
4562		device_printf(sc->dev,
4563			      "Try setting hw.mxge.max_slices to %d\n",
4564			      count);
4565		err = ENOSPC;
4566		goto abort_with_msix;
4567	}
4568	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4569	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4570	if (sc->msix_irq_res == NULL) {
4571		err = ENOMEM;
4572		goto abort_with_msix;
4573	}
4574
4575	for (i = 0; i < sc->num_slices; i++) {
4576		rid = i + 1;
4577		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4578							  SYS_RES_IRQ,
4579							  &rid, RF_ACTIVE);
4580		if (sc->msix_irq_res[i] == NULL) {
4581			device_printf(sc->dev, "couldn't allocate IRQ res"
4582				      " for message %d\n", i);
4583			err = ENXIO;
4584			goto abort_with_res;
4585		}
4586	}
4587
4588	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4589	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4590
4591	for (i = 0; i < sc->num_slices; i++) {
4592		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4593				     INTR_TYPE_NET | INTR_MPSAFE,
4594#if __FreeBSD_version > 700030
4595				     NULL,
4596#endif
4597				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4598		if (err != 0) {
4599			device_printf(sc->dev, "couldn't setup intr for "
4600				      "message %d\n", i);
4601			goto abort_with_intr;
4602		}
4603		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4604				  sc->msix_ih[i], "s%d", i);
4605	}
4606
4607	if (mxge_verbose) {
4608		device_printf(sc->dev, "using %d msix IRQs:",
4609			      sc->num_slices);
4610		for (i = 0; i < sc->num_slices; i++)
4611			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4612		printf("\n");
4613	}
4614	return (0);
4615
4616abort_with_intr:
4617	for (i = 0; i < sc->num_slices; i++) {
4618		if (sc->msix_ih[i] != NULL) {
4619			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4620					  sc->msix_ih[i]);
4621			sc->msix_ih[i] = NULL;
4622		}
4623	}
4624	free(sc->msix_ih, M_DEVBUF);
4625
4626
4627abort_with_res:
4628	for (i = 0; i < sc->num_slices; i++) {
4629		rid = i + 1;
4630		if (sc->msix_irq_res[i] != NULL)
4631			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4632					     sc->msix_irq_res[i]);
4633		sc->msix_irq_res[i] = NULL;
4634	}
4635	free(sc->msix_irq_res, M_DEVBUF);
4636
4637
4638abort_with_msix:
4639	pci_release_msi(sc->dev);
4640
4641abort_with_msix_table:
4642	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4643			     sc->msix_table_res);
4644
4645	return err;
4646}
4647
4648static int
4649mxge_add_single_irq(mxge_softc_t *sc)
4650{
4651	int count, err, rid;
4652
4653	count = pci_msi_count(sc->dev);
4654	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4655		rid = 1;
4656	} else {
4657		rid = 0;
4658		sc->legacy_irq = 1;
4659	}
4660	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4661					 1, RF_SHAREABLE | RF_ACTIVE);
4662	if (sc->irq_res == NULL) {
4663		device_printf(sc->dev, "could not alloc interrupt\n");
4664		return ENXIO;
4665	}
4666	if (mxge_verbose)
4667		device_printf(sc->dev, "using %s irq %ld\n",
4668			      sc->legacy_irq ? "INTx" : "MSI",
4669			      rman_get_start(sc->irq_res));
4670	err = bus_setup_intr(sc->dev, sc->irq_res,
4671			     INTR_TYPE_NET | INTR_MPSAFE,
4672#if __FreeBSD_version > 700030
4673			     NULL,
4674#endif
4675			     mxge_intr, &sc->ss[0], &sc->ih);
4676	if (err != 0) {
4677		bus_release_resource(sc->dev, SYS_RES_IRQ,
4678				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4679		if (!sc->legacy_irq)
4680			pci_release_msi(sc->dev);
4681	}
4682	return err;
4683}
4684
4685static void
4686mxge_rem_msix_irqs(mxge_softc_t *sc)
4687{
4688	int i, rid;
4689
4690	for (i = 0; i < sc->num_slices; i++) {
4691		if (sc->msix_ih[i] != NULL) {
4692			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4693					  sc->msix_ih[i]);
4694			sc->msix_ih[i] = NULL;
4695		}
4696	}
4697	free(sc->msix_ih, M_DEVBUF);
4698
4699	for (i = 0; i < sc->num_slices; i++) {
4700		rid = i + 1;
4701		if (sc->msix_irq_res[i] != NULL)
4702			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4703					     sc->msix_irq_res[i]);
4704		sc->msix_irq_res[i] = NULL;
4705	}
4706	free(sc->msix_irq_res, M_DEVBUF);
4707
4708	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4709			     sc->msix_table_res);
4710
4711	pci_release_msi(sc->dev);
4712	return;
4713}
4714
4715static void
4716mxge_rem_single_irq(mxge_softc_t *sc)
4717{
4718	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4719	bus_release_resource(sc->dev, SYS_RES_IRQ,
4720			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4721	if (!sc->legacy_irq)
4722		pci_release_msi(sc->dev);
4723}
4724
4725static void
4726mxge_rem_irq(mxge_softc_t *sc)
4727{
4728	if (sc->num_slices > 1)
4729		mxge_rem_msix_irqs(sc);
4730	else
4731		mxge_rem_single_irq(sc);
4732}
4733
4734static int
4735mxge_add_irq(mxge_softc_t *sc)
4736{
4737	int err;
4738
4739	if (sc->num_slices > 1)
4740		err = mxge_add_msix_irqs(sc);
4741	else
4742		err = mxge_add_single_irq(sc);
4743
4744	if (0 && err == 0 && sc->num_slices > 1) {
4745		mxge_rem_msix_irqs(sc);
4746		err = mxge_add_msix_irqs(sc);
4747	}
4748	return err;
4749}
4750
4751
4752static int
4753mxge_attach(device_t dev)
4754{
4755	mxge_cmd_t cmd;
4756	mxge_softc_t *sc = device_get_softc(dev);
4757	struct ifnet *ifp;
4758	int err, rid;
4759
4760	sc->dev = dev;
4761	mxge_fetch_tunables(sc);
4762
4763	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4764	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4765				  taskqueue_thread_enqueue, &sc->tq);
4766	if (sc->tq == NULL) {
4767		err = ENOMEM;
4768		goto abort_with_nothing;
4769	}
4770
4771	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4772				 1,			/* alignment */
4773				 0,			/* boundary */
4774				 BUS_SPACE_MAXADDR,	/* low */
4775				 BUS_SPACE_MAXADDR,	/* high */
4776				 NULL, NULL,		/* filter */
4777				 65536 + 256,		/* maxsize */
4778				 MXGE_MAX_SEND_DESC, 	/* num segs */
4779				 65536,			/* maxsegsize */
4780				 0,			/* flags */
4781				 NULL, NULL,		/* lock */
4782				 &sc->parent_dmat);	/* tag */
4783
4784	if (err != 0) {
4785		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4786			      err);
4787		goto abort_with_tq;
4788	}
4789
4790	ifp = sc->ifp = if_alloc(IFT_ETHER);
4791	if (ifp == NULL) {
4792		device_printf(dev, "can not if_alloc()\n");
4793		err = ENOSPC;
4794		goto abort_with_parent_dmat;
4795	}
4796	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4797
4798	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4799		 device_get_nameunit(dev));
4800	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4801	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4802		 "%s:drv", device_get_nameunit(dev));
4803	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4804		 MTX_NETWORK_LOCK, MTX_DEF);
4805
4806	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4807
4808	mxge_setup_cfg_space(sc);
4809
4810	/* Map the board into the kernel */
4811	rid = PCIR_BARS;
4812	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4813					 ~0, 1, RF_ACTIVE);
4814	if (sc->mem_res == NULL) {
4815		device_printf(dev, "could not map memory\n");
4816		err = ENXIO;
4817		goto abort_with_lock;
4818	}
4819	sc->sram = rman_get_virtual(sc->mem_res);
4820	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4821	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4822		device_printf(dev, "impossible memory region size %ld\n",
4823			      rman_get_size(sc->mem_res));
4824		err = ENXIO;
4825		goto abort_with_mem_res;
4826	}
4827
4828	/* make NULL terminated copy of the EEPROM strings section of
4829	   lanai SRAM */
4830	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4831	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4832				rman_get_bushandle(sc->mem_res),
4833				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4834				sc->eeprom_strings,
4835				MXGE_EEPROM_STRINGS_SIZE - 2);
4836	err = mxge_parse_strings(sc);
4837	if (err != 0)
4838		goto abort_with_mem_res;
4839
4840	/* Enable write combining for efficient use of PCIe bus */
4841	mxge_enable_wc(sc);
4842
4843	/* Allocate the out of band dma memory */
4844	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4845			     sizeof (mxge_cmd_t), 64);
4846	if (err != 0)
4847		goto abort_with_mem_res;
4848	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4849	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4850	if (err != 0)
4851		goto abort_with_cmd_dma;
4852
4853	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4854	if (err != 0)
4855		goto abort_with_zeropad_dma;
4856
4857	/* select & load the firmware */
4858	err = mxge_select_firmware(sc);
4859	if (err != 0)
4860		goto abort_with_dmabench;
4861	sc->intr_coal_delay = mxge_intr_coal_delay;
4862
4863	mxge_slice_probe(sc);
4864	err = mxge_alloc_slices(sc);
4865	if (err != 0)
4866		goto abort_with_dmabench;
4867
4868	err = mxge_reset(sc, 0);
4869	if (err != 0)
4870		goto abort_with_slices;
4871
4872	err = mxge_alloc_rings(sc);
4873	if (err != 0) {
4874		device_printf(sc->dev, "failed to allocate rings\n");
4875		goto abort_with_slices;
4876	}
4877
4878	err = mxge_add_irq(sc);
4879	if (err != 0) {
4880		device_printf(sc->dev, "failed to add irq\n");
4881		goto abort_with_rings;
4882	}
4883
4884	if_initbaudrate(ifp, IF_Gbps(10));
4885	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4886		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4887		IFCAP_RXCSUM_IPV6;
4888#if defined(INET) || defined(INET6)
4889	ifp->if_capabilities |= IFCAP_LRO;
4890#endif
4891
4892#ifdef MXGE_NEW_VLAN_API
4893	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4894
4895	/* Only FW 1.4.32 and newer can do TSO over vlans */
4896	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4897	    sc->fw_ver_tiny >= 32)
4898		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4899#endif
4900	sc->max_mtu = mxge_max_mtu(sc);
4901	if (sc->max_mtu >= 9000)
4902		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4903	else
4904		device_printf(dev, "MTU limited to %d.  Install "
4905			      "latest firmware for 9000 byte jumbo support\n",
4906			      sc->max_mtu - ETHER_HDR_LEN);
4907	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4908	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4909	/* check to see if f/w supports TSO for IPv6 */
4910	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4911		if (CSUM_TCP_IPV6)
4912			ifp->if_capabilities |= IFCAP_TSO6;
4913		sc->max_tso6_hlen = min(cmd.data0,
4914					sizeof (sc->ss[0].scratch));
4915	}
4916	ifp->if_capenable = ifp->if_capabilities;
4917	if (sc->lro_cnt == 0)
4918		ifp->if_capenable &= ~IFCAP_LRO;
4919        ifp->if_init = mxge_init;
4920        ifp->if_softc = sc;
4921        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4922        ifp->if_ioctl = mxge_ioctl;
4923        ifp->if_start = mxge_start;
4924	/* Initialise the ifmedia structure */
4925	ifmedia_init(&sc->media, 0, mxge_media_change,
4926		     mxge_media_status);
4927	mxge_media_init(sc);
4928	mxge_media_probe(sc);
4929	sc->dying = 0;
4930	ether_ifattach(ifp, sc->mac_addr);
4931	/* ether_ifattach sets mtu to ETHERMTU */
4932	if (mxge_initial_mtu != ETHERMTU)
4933		mxge_change_mtu(sc, mxge_initial_mtu);
4934
4935	mxge_add_sysctls(sc);
4936#ifdef IFNET_BUF_RING
4937	ifp->if_transmit = mxge_transmit;
4938	ifp->if_qflush = mxge_qflush;
4939#endif
4940	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4941				device_get_nameunit(sc->dev));
4942	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4943	return 0;
4944
4945abort_with_rings:
4946	mxge_free_rings(sc);
4947abort_with_slices:
4948	mxge_free_slices(sc);
4949abort_with_dmabench:
4950	mxge_dma_free(&sc->dmabench_dma);
4951abort_with_zeropad_dma:
4952	mxge_dma_free(&sc->zeropad_dma);
4953abort_with_cmd_dma:
4954	mxge_dma_free(&sc->cmd_dma);
4955abort_with_mem_res:
4956	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4957abort_with_lock:
4958	pci_disable_busmaster(dev);
4959	mtx_destroy(&sc->cmd_mtx);
4960	mtx_destroy(&sc->driver_mtx);
4961	if_free(ifp);
4962abort_with_parent_dmat:
4963	bus_dma_tag_destroy(sc->parent_dmat);
4964abort_with_tq:
4965	if (sc->tq != NULL) {
4966		taskqueue_drain(sc->tq, &sc->watchdog_task);
4967		taskqueue_free(sc->tq);
4968		sc->tq = NULL;
4969	}
4970abort_with_nothing:
4971	return err;
4972}
4973
4974static int
4975mxge_detach(device_t dev)
4976{
4977	mxge_softc_t *sc = device_get_softc(dev);
4978
4979	if (mxge_vlans_active(sc)) {
4980		device_printf(sc->dev,
4981			      "Detach vlans before removing module\n");
4982		return EBUSY;
4983	}
4984	mtx_lock(&sc->driver_mtx);
4985	sc->dying = 1;
4986	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4987		mxge_close(sc, 0);
4988	mtx_unlock(&sc->driver_mtx);
4989	ether_ifdetach(sc->ifp);
4990	if (sc->tq != NULL) {
4991		taskqueue_drain(sc->tq, &sc->watchdog_task);
4992		taskqueue_free(sc->tq);
4993		sc->tq = NULL;
4994	}
4995	callout_drain(&sc->co_hdl);
4996	ifmedia_removeall(&sc->media);
4997	mxge_dummy_rdma(sc, 0);
4998	mxge_rem_sysctls(sc);
4999	mxge_rem_irq(sc);
5000	mxge_free_rings(sc);
5001	mxge_free_slices(sc);
5002	mxge_dma_free(&sc->dmabench_dma);
5003	mxge_dma_free(&sc->zeropad_dma);
5004	mxge_dma_free(&sc->cmd_dma);
5005	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5006	pci_disable_busmaster(dev);
5007	mtx_destroy(&sc->cmd_mtx);
5008	mtx_destroy(&sc->driver_mtx);
5009	if_free(sc->ifp);
5010	bus_dma_tag_destroy(sc->parent_dmat);
5011	return 0;
5012}
5013
5014static int
5015mxge_shutdown(device_t dev)
5016{
5017	return 0;
5018}
5019
5020/*
5021  This file uses Myri10GE driver indentation.
5022
5023  Local Variables:
5024  c-file-style:"linux"
5025  tab-width:8
5026  End:
5027*/
5028