1/******************************************************************************
2SPDX-License-Identifier: BSD-2-Clause
3
4Copyright (c) 2006-2013, Myricom Inc.
5All rights reserved.
6
7Redistribution and use in source and binary forms, with or without
8modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice,
11    this list of conditions and the following disclaimer.
12
13 2. Neither the name of the Myricom Inc, nor the names of its
14    contributors may be used to endorse or promote products derived from
15    this software without specific prior written permission.
16
17THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27POSSIBILITY OF SUCH DAMAGE.
28
29***************************************************************************/
30
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/linker.h>
34#include <sys/firmware.h>
35#include <sys/endian.h>
36#include <sys/sockio.h>
37#include <sys/mbuf.h>
38#include <sys/malloc.h>
39#include <sys/kdb.h>
40#include <sys/kernel.h>
41#include <sys/lock.h>
42#include <sys/module.h>
43#include <sys/socket.h>
44#include <sys/sysctl.h>
45#include <sys/sx.h>
46#include <sys/taskqueue.h>
47#include <contrib/zlib/zlib.h>
48#include <dev/zlib/zcalloc.h>
49
50#include <net/if.h>
51#include <net/if_var.h>
52#include <net/if_arp.h>
53#include <net/ethernet.h>
54#include <net/if_dl.h>
55#include <net/if_media.h>
56
57#include <net/bpf.h>
58
59#include <net/if_types.h>
60#include <net/if_vlan_var.h>
61
62#include <netinet/in_systm.h>
63#include <netinet/in.h>
64#include <netinet/ip.h>
65#include <netinet/ip6.h>
66#include <netinet/tcp.h>
67#include <netinet/tcp_lro.h>
68#include <netinet6/ip6_var.h>
69
70#include <machine/bus.h>
71#include <machine/in_cksum.h>
72#include <machine/resource.h>
73#include <sys/bus.h>
74#include <sys/rman.h>
75#include <sys/smp.h>
76
77#include <dev/pci/pcireg.h>
78#include <dev/pci/pcivar.h>
79#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
80
81#include <vm/vm.h>		/* for pmap_mapdev() */
82#include <vm/pmap.h>
83
84#if defined(__i386) || defined(__amd64)
85#include <machine/specialreg.h>
86#endif
87
88#include <dev/mxge/mxge_mcp.h>
89#include <dev/mxge/mcp_gen_header.h>
90/*#define MXGE_FAKE_IFP*/
91#include <dev/mxge/if_mxge_var.h>
92#include <sys/buf_ring.h>
93
94#include "opt_inet.h"
95#include "opt_inet6.h"
96
97/* tunable params */
98static int mxge_nvidia_ecrc_enable = 1;
99static int mxge_force_firmware = 0;
100static int mxge_intr_coal_delay = 30;
101static int mxge_deassert_wait = 1;
102static int mxge_flow_control = 1;
103static int mxge_verbose = 0;
104static int mxge_ticks;
105static int mxge_max_slices = 1;
106static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
107static int mxge_always_promisc = 0;
108static int mxge_initial_mtu = ETHERMTU_JUMBO;
109static int mxge_throttle = 0;
110static char *mxge_fw_unaligned = "mxge_ethp_z8e";
111static char *mxge_fw_aligned = "mxge_eth_z8e";
112static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
113static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
114
115static int mxge_probe(device_t dev);
116static int mxge_attach(device_t dev);
117static int mxge_detach(device_t dev);
118static int mxge_shutdown(device_t dev);
119static void mxge_intr(void *arg);
120
121static device_method_t mxge_methods[] =
122{
123  /* Device interface */
124  DEVMETHOD(device_probe, mxge_probe),
125  DEVMETHOD(device_attach, mxge_attach),
126  DEVMETHOD(device_detach, mxge_detach),
127  DEVMETHOD(device_shutdown, mxge_shutdown),
128
129  DEVMETHOD_END
130};
131
132static driver_t mxge_driver =
133{
134  "mxge",
135  mxge_methods,
136  sizeof(mxge_softc_t),
137};
138
139/* Declare ourselves to be a child of the PCI bus.*/
140DRIVER_MODULE(mxge, pci, mxge_driver, 0, 0);
141MODULE_DEPEND(mxge, firmware, 1, 1, 1);
142MODULE_DEPEND(mxge, zlib, 1, 1, 1);
143
144static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
145static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
146static int mxge_close(mxge_softc_t *sc, int down);
147static int mxge_open(mxge_softc_t *sc);
148static void mxge_tick(void *arg);
149
150static int
151mxge_probe(device_t dev)
152{
153	int rev;
154
155	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
156	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
157	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
158		rev = pci_get_revid(dev);
159		switch (rev) {
160		case MXGE_PCI_REV_Z8E:
161			device_set_desc(dev, "Myri10G-PCIE-8A");
162			break;
163		case MXGE_PCI_REV_Z8ES:
164			device_set_desc(dev, "Myri10G-PCIE-8B");
165			break;
166		default:
167			device_set_desc(dev, "Myri10G-PCIE-8??");
168			device_printf(dev, "Unrecognized rev %d NIC\n",
169				      rev);
170			break;
171		}
172		return 0;
173	}
174	return ENXIO;
175}
176
177static void
178mxge_enable_wc(mxge_softc_t *sc)
179{
180#if defined(__i386) || defined(__amd64)
181	vm_offset_t len;
182	int err;
183
184	sc->wc = 1;
185	len = rman_get_size(sc->mem_res);
186	err = pmap_change_attr((vm_offset_t) sc->sram,
187			       len, PAT_WRITE_COMBINING);
188	if (err != 0) {
189		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
190			      err);
191		sc->wc = 0;
192	}
193#endif
194}
195
196/* callback to get our DMA address */
197static void
198mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
199			 int error)
200{
201	if (error == 0) {
202		*(bus_addr_t *) arg = segs->ds_addr;
203	}
204}
205
206static int
207mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
208		   bus_size_t alignment)
209{
210	int err;
211	device_t dev = sc->dev;
212	bus_size_t boundary, maxsegsize;
213
214	if (bytes > 4096 && alignment == 4096) {
215		boundary = 0;
216		maxsegsize = bytes;
217	} else {
218		boundary = 4096;
219		maxsegsize = 4096;
220	}
221
222	/* allocate DMAable memory tags */
223	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
224				 alignment,		/* alignment */
225				 boundary,		/* boundary */
226				 BUS_SPACE_MAXADDR,	/* low */
227				 BUS_SPACE_MAXADDR,	/* high */
228				 NULL, NULL,		/* filter */
229				 bytes,			/* maxsize */
230				 1,			/* num segs */
231				 maxsegsize,		/* maxsegsize */
232				 BUS_DMA_COHERENT,	/* flags */
233				 NULL, NULL,		/* lock */
234				 &dma->dmat);		/* tag */
235	if (err != 0) {
236		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
237		return err;
238	}
239
240	/* allocate DMAable memory & map */
241	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
242			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
243				| BUS_DMA_ZERO),  &dma->map);
244	if (err != 0) {
245		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
246		goto abort_with_dmat;
247	}
248
249	/* load the memory */
250	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
251			      mxge_dmamap_callback,
252			      (void *)&dma->bus_addr, 0);
253	if (err != 0) {
254		device_printf(dev, "couldn't load map (err = %d)\n", err);
255		goto abort_with_mem;
256	}
257	return 0;
258
259abort_with_mem:
260	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
261abort_with_dmat:
262	(void)bus_dma_tag_destroy(dma->dmat);
263	return err;
264}
265
266static void
267mxge_dma_free(mxge_dma_t *dma)
268{
269	bus_dmamap_unload(dma->dmat, dma->map);
270	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
271	(void)bus_dma_tag_destroy(dma->dmat);
272}
273
274/*
275 * The eeprom strings on the lanaiX have the format
276 * SN=x\0
277 * MAC=x:x:x:x:x:x\0
278 * PC=text\0
279 */
280
281static int
282mxge_parse_strings(mxge_softc_t *sc)
283{
284	char *ptr;
285	int i, found_mac, found_sn2;
286	char *endptr;
287
288	ptr = sc->eeprom_strings;
289	found_mac = 0;
290	found_sn2 = 0;
291	while (*ptr != '\0') {
292		if (strncmp(ptr, "MAC=", 4) == 0) {
293			ptr += 4;
294			for (i = 0;;) {
295				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
296				if (endptr - ptr != 2)
297					goto abort;
298				ptr = endptr;
299				if (++i == 6)
300					break;
301				if (*ptr++ != ':')
302					goto abort;
303			}
304			found_mac = 1;
305		} else if (strncmp(ptr, "PC=", 3) == 0) {
306			ptr += 3;
307			strlcpy(sc->product_code_string, ptr,
308			    sizeof(sc->product_code_string));
309		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
310			ptr += 3;
311			strlcpy(sc->serial_number_string, ptr,
312			    sizeof(sc->serial_number_string));
313		} else if (strncmp(ptr, "SN2=", 4) == 0) {
314			/* SN2 takes precedence over SN */
315			ptr += 4;
316			found_sn2 = 1;
317			strlcpy(sc->serial_number_string, ptr,
318			    sizeof(sc->serial_number_string));
319		}
320		while (*ptr++ != '\0') {}
321	}
322
323	if (found_mac)
324		return 0;
325
326 abort:
327	device_printf(sc->dev, "failed to parse eeprom_strings\n");
328
329	return ENXIO;
330}
331
332#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
333static void
334mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
335{
336	uint32_t val;
337	unsigned long base, off;
338	char *va, *cfgptr;
339	device_t pdev, mcp55;
340	uint16_t vendor_id, device_id, word;
341	uintptr_t bus, slot, func, ivend, idev;
342	uint32_t *ptr32;
343
344	if (!mxge_nvidia_ecrc_enable)
345		return;
346
347	pdev = device_get_parent(device_get_parent(sc->dev));
348	if (pdev == NULL) {
349		device_printf(sc->dev, "could not find parent?\n");
350		return;
351	}
352	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
353	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
354
355	if (vendor_id != 0x10de)
356		return;
357
358	base = 0;
359
360	if (device_id == 0x005d) {
361		/* ck804, base address is magic */
362		base = 0xe0000000UL;
363	} else if (device_id >= 0x0374 && device_id <= 0x378) {
364		/* mcp55, base address stored in chipset */
365		mcp55 = pci_find_bsf(0, 0, 0);
366		if (mcp55 &&
367		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
368		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
369			word = pci_read_config(mcp55, 0x90, 2);
370			base = ((unsigned long)word & 0x7ffeU) << 25;
371		}
372	}
373	if (!base)
374		return;
375
376	/* XXXX
377	   Test below is commented because it is believed that doing
378	   config read/write beyond 0xff will access the config space
379	   for the next larger function.  Uncomment this and remove
380	   the hacky pmap_mapdev() way of accessing config space when
381	   FreeBSD grows support for extended pcie config space access
382	*/
383#if 0
384	/* See if we can, by some miracle, access the extended
385	   config space */
386	val = pci_read_config(pdev, 0x178, 4);
387	if (val != 0xffffffff) {
388		val |= 0x40;
389		pci_write_config(pdev, 0x178, val, 4);
390		return;
391	}
392#endif
393	/* Rather than using normal pci config space writes, we must
394	 * map the Nvidia config space ourselves.  This is because on
395	 * opteron/nvidia class machine the 0xe000000 mapping is
396	 * handled by the nvidia chipset, that means the internal PCI
397	 * device (the on-chip northbridge), or the amd-8131 bridge
398	 * and things behind them are not visible by this method.
399	 */
400
401	BUS_READ_IVAR(device_get_parent(pdev), pdev,
402		      PCI_IVAR_BUS, &bus);
403	BUS_READ_IVAR(device_get_parent(pdev), pdev,
404		      PCI_IVAR_SLOT, &slot);
405	BUS_READ_IVAR(device_get_parent(pdev), pdev,
406		      PCI_IVAR_FUNCTION, &func);
407	BUS_READ_IVAR(device_get_parent(pdev), pdev,
408		      PCI_IVAR_VENDOR, &ivend);
409	BUS_READ_IVAR(device_get_parent(pdev), pdev,
410		      PCI_IVAR_DEVICE, &idev);
411
412	off =  base
413		+ 0x00100000UL * (unsigned long)bus
414		+ 0x00001000UL * (unsigned long)(func
415						 + 8 * slot);
416
417	/* map it into the kernel */
418	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
419
420	if (va == NULL) {
421		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
422		return;
423	}
424	/* get a pointer to the config space mapped into the kernel */
425	cfgptr = va + (off & PAGE_MASK);
426
427	/* make sure that we can really access it */
428	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
429	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
430	if (! (vendor_id == ivend && device_id == idev)) {
431		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
432			      vendor_id, device_id);
433		pmap_unmapdev(va, PAGE_SIZE);
434		return;
435	}
436
437	ptr32 = (uint32_t*)(cfgptr + 0x178);
438	val = *ptr32;
439
440	if (val == 0xffffffff) {
441		device_printf(sc->dev, "extended mapping failed\n");
442		pmap_unmapdev(va, PAGE_SIZE);
443		return;
444	}
445	*ptr32 = val | 0x40;
446	pmap_unmapdev(va, PAGE_SIZE);
447	if (mxge_verbose)
448		device_printf(sc->dev,
449			      "Enabled ECRC on upstream Nvidia bridge "
450			      "at %d:%d:%d\n",
451			      (int)bus, (int)slot, (int)func);
452	return;
453}
454#else
455static void
456mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
457{
458	device_printf(sc->dev,
459		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
460	return;
461}
462#endif
463
464static int
465mxge_dma_test(mxge_softc_t *sc, int test_type)
466{
467	mxge_cmd_t cmd;
468	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
469	int status;
470	uint32_t len;
471	char *test = " ";
472
473	/* Run a small DMA test.
474	 * The magic multipliers to the length tell the firmware
475	 * to do DMA read, write, or read+write tests.  The
476	 * results are returned in cmd.data0.  The upper 16
477	 * bits of the return is the number of transfers completed.
478	 * The lower 16 bits is the time in 0.5us ticks that the
479	 * transfers took to complete.
480	 */
481
482	len = sc->tx_boundary;
483
484	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
485	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
486	cmd.data2 = len * 0x10000;
487	status = mxge_send_cmd(sc, test_type, &cmd);
488	if (status != 0) {
489		test = "read";
490		goto abort;
491	}
492	sc->read_dma = ((cmd.data0>>16) * len * 2) /
493		(cmd.data0 & 0xffff);
494	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
495	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
496	cmd.data2 = len * 0x1;
497	status = mxge_send_cmd(sc, test_type, &cmd);
498	if (status != 0) {
499		test = "write";
500		goto abort;
501	}
502	sc->write_dma = ((cmd.data0>>16) * len * 2) /
503		(cmd.data0 & 0xffff);
504
505	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
506	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
507	cmd.data2 = len * 0x10001;
508	status = mxge_send_cmd(sc, test_type, &cmd);
509	if (status != 0) {
510		test = "read/write";
511		goto abort;
512	}
513	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
514		(cmd.data0 & 0xffff);
515
516abort:
517	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
518		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
519			      test, status);
520
521	return status;
522}
523
524/*
525 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
526 * when the PCI-E Completion packets are aligned on an 8-byte
527 * boundary.  Some PCI-E chip sets always align Completion packets; on
528 * the ones that do not, the alignment can be enforced by enabling
529 * ECRC generation (if supported).
530 *
531 * When PCI-E Completion packets are not aligned, it is actually more
532 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
533 *
534 * If the driver can neither enable ECRC nor verify that it has
535 * already been enabled, then it must use a firmware image which works
536 * around unaligned completion packets (ethp_z8e.dat), and it should
537 * also ensure that it never gives the device a Read-DMA which is
538 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
539 * enabled, then the driver should use the aligned (eth_z8e.dat)
540 * firmware image, and set tx_boundary to 4KB.
541 */
542
543static int
544mxge_firmware_probe(mxge_softc_t *sc)
545{
546	device_t dev = sc->dev;
547	int reg, status;
548	uint16_t pectl;
549
550	sc->tx_boundary = 4096;
551	/*
552	 * Verify the max read request size was set to 4KB
553	 * before trying the test with 4KB.
554	 */
555	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
556		pectl = pci_read_config(dev, reg + 0x8, 2);
557		if ((pectl & (5 << 12)) != (5 << 12)) {
558			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
559				      pectl);
560			sc->tx_boundary = 2048;
561		}
562	}
563
564	/*
565	 * load the optimized firmware (which assumes aligned PCIe
566	 * completions) in order to see if it works on this host.
567	 */
568	sc->fw_name = mxge_fw_aligned;
569	status = mxge_load_firmware(sc, 1);
570	if (status != 0) {
571		return status;
572	}
573
574	/*
575	 * Enable ECRC if possible
576	 */
577	mxge_enable_nvidia_ecrc(sc);
578
579	/*
580	 * Run a DMA test which watches for unaligned completions and
581	 * aborts on the first one seen.  Not required on Z8ES or newer.
582	 */
583	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
584		return 0;
585	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
586	if (status == 0)
587		return 0; /* keep the aligned firmware */
588
589	if (status != E2BIG)
590		device_printf(dev, "DMA test failed: %d\n", status);
591	if (status == ENOSYS)
592		device_printf(dev, "Falling back to ethp! "
593			      "Please install up to date fw\n");
594	return status;
595}
596
597static int
598mxge_select_firmware(mxge_softc_t *sc)
599{
600	int aligned = 0;
601	int force_firmware = mxge_force_firmware;
602
603	if (sc->throttle)
604		force_firmware = sc->throttle;
605
606	if (force_firmware != 0) {
607		if (force_firmware == 1)
608			aligned = 1;
609		else
610			aligned = 0;
611		if (mxge_verbose)
612			device_printf(sc->dev,
613				      "Assuming %s completions (forced)\n",
614				      aligned ? "aligned" : "unaligned");
615		goto abort;
616	}
617
618	/* if the PCIe link width is 4 or less, we can use the aligned
619	   firmware and skip any checks */
620	if (sc->link_width != 0 && sc->link_width <= 4) {
621		device_printf(sc->dev,
622			      "PCIe x%d Link, expect reduced performance\n",
623			      sc->link_width);
624		aligned = 1;
625		goto abort;
626	}
627
628	if (0 == mxge_firmware_probe(sc))
629		return 0;
630
631abort:
632	if (aligned) {
633		sc->fw_name = mxge_fw_aligned;
634		sc->tx_boundary = 4096;
635	} else {
636		sc->fw_name = mxge_fw_unaligned;
637		sc->tx_boundary = 2048;
638	}
639	return (mxge_load_firmware(sc, 0));
640}
641
642static int
643mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
644{
645
646	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
647		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
648			      be32toh(hdr->mcp_type));
649		return EIO;
650	}
651
652	/* save firmware version for sysctl */
653	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
654	if (mxge_verbose)
655		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
656
657	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
658	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
659
660	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
661	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
662		device_printf(sc->dev, "Found firmware version %s\n",
663			      sc->fw_version);
664		device_printf(sc->dev, "Driver needs %d.%d\n",
665			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
666		return EINVAL;
667	}
668	return 0;
669
670}
671
672static int
673mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
674{
675	z_stream zs;
676	char *inflate_buffer;
677	const struct firmware *fw;
678	const mcp_gen_header_t *hdr;
679	unsigned hdr_offset;
680	int status;
681	unsigned int i;
682	size_t fw_len;
683
684	fw = firmware_get(sc->fw_name);
685	if (fw == NULL) {
686		device_printf(sc->dev, "Could not find firmware image %s\n",
687			      sc->fw_name);
688		return ENOENT;
689	}
690
691	/* setup zlib and decompress f/w */
692	bzero(&zs, sizeof (zs));
693	zs.zalloc = zcalloc_nowait;
694	zs.zfree = zcfree;
695	status = inflateInit(&zs);
696	if (status != Z_OK) {
697		status = EIO;
698		goto abort_with_fw;
699	}
700
701	/* the uncompressed size is stored as the firmware version,
702	   which would otherwise go unused */
703	fw_len = (size_t) fw->version;
704	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
705	if (inflate_buffer == NULL)
706		goto abort_with_zs;
707	zs.avail_in = fw->datasize;
708	zs.next_in = __DECONST(char *, fw->data);
709	zs.avail_out = fw_len;
710	zs.next_out = inflate_buffer;
711	status = inflate(&zs, Z_FINISH);
712	if (status != Z_STREAM_END) {
713		device_printf(sc->dev, "zlib %d\n", status);
714		status = EIO;
715		goto abort_with_buffer;
716	}
717
718	/* check id */
719	hdr_offset = htobe32(*(const uint32_t *)
720			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
721	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
722		device_printf(sc->dev, "Bad firmware file");
723		status = EIO;
724		goto abort_with_buffer;
725	}
726	hdr = (const void*)(inflate_buffer + hdr_offset);
727
728	status = mxge_validate_firmware(sc, hdr);
729	if (status != 0)
730		goto abort_with_buffer;
731
732	/* Copy the inflated firmware to NIC SRAM. */
733	for (i = 0; i < fw_len; i += 256) {
734		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
735			      inflate_buffer + i,
736			      min(256U, (unsigned)(fw_len - i)));
737		wmb();
738		(void)*sc->sram;
739		wmb();
740	}
741
742	*limit = fw_len;
743	status = 0;
744abort_with_buffer:
745	free(inflate_buffer, M_TEMP);
746abort_with_zs:
747	inflateEnd(&zs);
748abort_with_fw:
749	firmware_put(fw, FIRMWARE_UNLOAD);
750	return status;
751}
752
753/*
754 * Enable or disable periodic RDMAs from the host to make certain
755 * chipsets resend dropped PCIe messages
756 */
757
758static void
759mxge_dummy_rdma(mxge_softc_t *sc, int enable)
760{
761	char buf_bytes[72];
762	volatile uint32_t *confirm;
763	volatile char *submit;
764	uint32_t *buf, dma_low, dma_high;
765	int i;
766
767	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
768
769	/* clear confirmation addr */
770	confirm = (volatile uint32_t *)sc->cmd;
771	*confirm = 0;
772	wmb();
773
774	/* send an rdma command to the PCIe engine, and wait for the
775	   response in the confirmation address.  The firmware should
776	   write a -1 there to indicate it is alive and well
777	*/
778
779	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
780	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
781	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
782	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
783	buf[2] = htobe32(0xffffffff);		/* confirm data */
784	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
785	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
786	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
787	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
788	buf[5] = htobe32(enable);			/* enable? */
789
790	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
791
792	mxge_pio_copy(submit, buf, 64);
793	wmb();
794	DELAY(1000);
795	wmb();
796	i = 0;
797	while (*confirm != 0xffffffff && i < 20) {
798		DELAY(1000);
799		i++;
800	}
801	if (*confirm != 0xffffffff) {
802		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
803			      (enable ? "enable" : "disable"), confirm,
804			      *confirm);
805	}
806	return;
807}
808
809static int
810mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
811{
812	mcp_cmd_t *buf;
813	char buf_bytes[sizeof(*buf) + 8];
814	volatile mcp_cmd_response_t *response = sc->cmd;
815	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
816	uint32_t dma_low, dma_high;
817	int err, sleep_total = 0;
818
819	/* ensure buf is aligned to 8 bytes */
820	buf = (mcp_cmd_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
821
822	buf->data0 = htobe32(data->data0);
823	buf->data1 = htobe32(data->data1);
824	buf->data2 = htobe32(data->data2);
825	buf->cmd = htobe32(cmd);
826	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
827	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
828
829	buf->response_addr.low = htobe32(dma_low);
830	buf->response_addr.high = htobe32(dma_high);
831	mtx_lock(&sc->cmd_mtx);
832	response->result = 0xffffffff;
833	wmb();
834	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
835
836	/* wait up to 20ms */
837	err = EAGAIN;
838	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
839		bus_dmamap_sync(sc->cmd_dma.dmat,
840				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
841		wmb();
842		switch (be32toh(response->result)) {
843		case 0:
844			data->data0 = be32toh(response->data);
845			err = 0;
846			break;
847		case 0xffffffff:
848			DELAY(1000);
849			break;
850		case MXGEFW_CMD_UNKNOWN:
851			err = ENOSYS;
852			break;
853		case MXGEFW_CMD_ERROR_UNALIGNED:
854			err = E2BIG;
855			break;
856		case MXGEFW_CMD_ERROR_BUSY:
857			err = EBUSY;
858			break;
859		case MXGEFW_CMD_ERROR_I2C_ABSENT:
860			err = ENXIO;
861			break;
862		default:
863			device_printf(sc->dev,
864				      "mxge: command %d "
865				      "failed, result = %d\n",
866				      cmd, be32toh(response->result));
867			err = ENXIO;
868			break;
869		}
870		if (err != EAGAIN)
871			break;
872	}
873	if (err == EAGAIN)
874		device_printf(sc->dev, "mxge: command %d timed out"
875			      "result = %d\n",
876			      cmd, be32toh(response->result));
877	mtx_unlock(&sc->cmd_mtx);
878	return err;
879}
880
881static int
882mxge_adopt_running_firmware(mxge_softc_t *sc)
883{
884	struct mcp_gen_header *hdr;
885	const size_t bytes = sizeof (struct mcp_gen_header);
886	size_t hdr_offset;
887	int status;
888
889	/* find running firmware header */
890	hdr_offset = htobe32(*(volatile uint32_t *)
891			     (sc->sram + MCP_HEADER_PTR_OFFSET));
892
893	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
894		device_printf(sc->dev,
895			      "Running firmware has bad header offset (%d)\n",
896			      (int)hdr_offset);
897		return EIO;
898	}
899
900	/* copy header of running firmware from SRAM to host memory to
901	 * validate firmware */
902	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
903	if (hdr == NULL) {
904		device_printf(sc->dev, "could not malloc firmware hdr\n");
905		return ENOMEM;
906	}
907	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
908				rman_get_bushandle(sc->mem_res),
909				hdr_offset, (char *)hdr, bytes);
910	status = mxge_validate_firmware(sc, hdr);
911	free(hdr, M_DEVBUF);
912
913	/*
914	 * check to see if adopted firmware has bug where adopting
915	 * it will cause broadcasts to be filtered unless the NIC
916	 * is kept in ALLMULTI mode
917	 */
918	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
919	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
920		sc->adopted_rx_filter_bug = 1;
921		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
922			      "working around rx filter bug\n",
923			      sc->fw_ver_major, sc->fw_ver_minor,
924			      sc->fw_ver_tiny);
925	}
926
927	return status;
928}
929
930static int
931mxge_load_firmware(mxge_softc_t *sc, int adopt)
932{
933	volatile uint32_t *confirm;
934	volatile char *submit;
935	char buf_bytes[72];
936	uint32_t *buf, size, dma_low, dma_high;
937	int status, i;
938
939	buf = (uint32_t *)((uintptr_t)(buf_bytes + 7) & ~7UL);
940
941	size = sc->sram_size;
942	status = mxge_load_firmware_helper(sc, &size);
943	if (status) {
944		if (!adopt)
945			return status;
946		/* Try to use the currently running firmware, if
947		   it is new enough */
948		status = mxge_adopt_running_firmware(sc);
949		if (status) {
950			device_printf(sc->dev,
951				      "failed to adopt running firmware\n");
952			return status;
953		}
954		device_printf(sc->dev,
955			      "Successfully adopted running firmware\n");
956		if (sc->tx_boundary == 4096) {
957			device_printf(sc->dev,
958				"Using firmware currently running on NIC"
959				 ".  For optimal\n");
960			device_printf(sc->dev,
961				 "performance consider loading optimized "
962				 "firmware\n");
963		}
964		sc->fw_name = mxge_fw_unaligned;
965		sc->tx_boundary = 2048;
966		return 0;
967	}
968	/* clear confirmation addr */
969	confirm = (volatile uint32_t *)sc->cmd;
970	*confirm = 0;
971	wmb();
972	/* send a reload command to the bootstrap MCP, and wait for the
973	   response in the confirmation address.  The firmware should
974	   write a -1 there to indicate it is alive and well
975	*/
976
977	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
978	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
979
980	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
981	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
982	buf[2] = htobe32(0xffffffff);	/* confirm data */
983
984	/* FIX: All newest firmware should un-protect the bottom of
985	   the sram before handoff. However, the very first interfaces
986	   do not. Therefore the handoff copy must skip the first 8 bytes
987	*/
988					/* where the code starts*/
989	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
990	buf[4] = htobe32(size - 8); 	/* length of code */
991	buf[5] = htobe32(8);		/* where to copy to */
992	buf[6] = htobe32(0);		/* where to jump to */
993
994	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
995	mxge_pio_copy(submit, buf, 64);
996	wmb();
997	DELAY(1000);
998	wmb();
999	i = 0;
1000	while (*confirm != 0xffffffff && i < 20) {
1001		DELAY(1000*10);
1002		i++;
1003		bus_dmamap_sync(sc->cmd_dma.dmat,
1004				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1005	}
1006	if (*confirm != 0xffffffff) {
1007		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1008			confirm, *confirm);
1009
1010		return ENXIO;
1011	}
1012	return 0;
1013}
1014
1015static int
1016mxge_update_mac_address(mxge_softc_t *sc)
1017{
1018	mxge_cmd_t cmd;
1019	uint8_t *addr = sc->mac_addr;
1020	int status;
1021
1022	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1023		     | (addr[2] << 8) | addr[3]);
1024
1025	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1026
1027	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1028	return status;
1029}
1030
1031static int
1032mxge_change_pause(mxge_softc_t *sc, int pause)
1033{
1034	mxge_cmd_t cmd;
1035	int status;
1036
1037	if (pause)
1038		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1039				       &cmd);
1040	else
1041		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1042				       &cmd);
1043
1044	if (status) {
1045		device_printf(sc->dev, "Failed to set flow control mode\n");
1046		return ENXIO;
1047	}
1048	sc->pause = pause;
1049	return 0;
1050}
1051
1052static void
1053mxge_change_promisc(mxge_softc_t *sc, int promisc)
1054{
1055	mxge_cmd_t cmd;
1056	int status;
1057
1058	if (mxge_always_promisc)
1059		promisc = 1;
1060
1061	if (promisc)
1062		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1063				       &cmd);
1064	else
1065		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1066				       &cmd);
1067
1068	if (status) {
1069		device_printf(sc->dev, "Failed to set promisc mode\n");
1070	}
1071}
1072
1073struct mxge_add_maddr_ctx {
1074	mxge_softc_t *sc;
1075	int error;
1076};
1077
1078static u_int
1079mxge_add_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
1080{
1081	struct mxge_add_maddr_ctx *ctx = arg;
1082	mxge_cmd_t cmd;
1083
1084	if (ctx->error != 0)
1085		return (0);
1086	bcopy(LLADDR(sdl), &cmd.data0, 4);
1087	bcopy(LLADDR(sdl) + 4, &cmd.data1, 2);
1088	cmd.data0 = htonl(cmd.data0);
1089	cmd.data1 = htonl(cmd.data1);
1090
1091	ctx->error = mxge_send_cmd(ctx->sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1092
1093	return (1);
1094}
1095
1096static void
1097mxge_set_multicast_list(mxge_softc_t *sc)
1098{
1099	struct mxge_add_maddr_ctx ctx;
1100	if_t ifp = sc->ifp;
1101	mxge_cmd_t cmd;
1102	int err;
1103
1104	/* This firmware is known to not support multicast */
1105	if (!sc->fw_multicast_support)
1106		return;
1107
1108	/* Disable multicast filtering while we play with the lists*/
1109	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1110	if (err != 0) {
1111		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1112		       " error status: %d\n", err);
1113		return;
1114	}
1115
1116	if (sc->adopted_rx_filter_bug)
1117		return;
1118
1119	if (if_getflags(ifp) & IFF_ALLMULTI)
1120		/* request to disable multicast filtering, so quit here */
1121		return;
1122
1123	/* Flush all the filters */
1124
1125	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1126	if (err != 0) {
1127		device_printf(sc->dev,
1128			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1129			      ", error status: %d\n", err);
1130		return;
1131	}
1132
1133	/* Walk the multicast list, and add each address */
1134	ctx.sc = sc;
1135	ctx.error = 0;
1136	if_foreach_llmaddr(ifp, mxge_add_maddr, &ctx);
1137	if (ctx.error != 0) {
1138		device_printf(sc->dev, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1139		    "error status:" "%d\t", ctx.error);
1140		/* abort, leaving multicast filtering off */
1141		return;
1142	}
1143
1144	/* Enable multicast filtering */
1145	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1146	if (err != 0) {
1147		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1148		       ", error status: %d\n", err);
1149	}
1150}
1151
1152static int
1153mxge_max_mtu(mxge_softc_t *sc)
1154{
1155	mxge_cmd_t cmd;
1156	int status;
1157
1158	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1159		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1160
1161	/* try to set nbufs to see if it we can
1162	   use virtually contiguous jumbos */
1163	cmd.data0 = 0;
1164	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1165			       &cmd);
1166	if (status == 0)
1167		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1168
1169	/* otherwise, we're limited to MJUMPAGESIZE */
1170	return MJUMPAGESIZE - MXGEFW_PAD;
1171}
1172
1173static int
1174mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1175{
1176	struct mxge_slice_state *ss;
1177	mxge_rx_done_t *rx_done;
1178	volatile uint32_t *irq_claim;
1179	mxge_cmd_t cmd;
1180	int slice, status;
1181
1182	/* try to send a reset command to the card to see if it
1183	   is alive */
1184	memset(&cmd, 0, sizeof (cmd));
1185	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1186	if (status != 0) {
1187		device_printf(sc->dev, "failed reset\n");
1188		return ENXIO;
1189	}
1190
1191	mxge_dummy_rdma(sc, 1);
1192
1193	/* set the intrq size */
1194	cmd.data0 = sc->rx_ring_size;
1195	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1196
1197	/*
1198	 * Even though we already know how many slices are supported
1199	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1200	 * has magic side effects, and must be called after a reset.
1201	 * It must be called prior to calling any RSS related cmds,
1202	 * including assigning an interrupt queue for anything but
1203	 * slice 0.  It must also be called *after*
1204	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1205	 * the firmware to compute offsets.
1206	 */
1207
1208	if (sc->num_slices > 1) {
1209		/* ask the maximum number of slices it supports */
1210		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1211					   &cmd);
1212		if (status != 0) {
1213			device_printf(sc->dev,
1214				      "failed to get number of slices\n");
1215			return status;
1216		}
1217		/*
1218		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1219		 * to setting up the interrupt queue DMA
1220		 */
1221		cmd.data0 = sc->num_slices;
1222		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1223		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1224		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1225					   &cmd);
1226		if (status != 0) {
1227			device_printf(sc->dev,
1228				      "failed to set number of slices\n");
1229			return status;
1230		}
1231	}
1232
1233	if (interrupts_setup) {
1234		/* Now exchange information about interrupts  */
1235		for (slice = 0; slice < sc->num_slices; slice++) {
1236			rx_done = &sc->ss[slice].rx_done;
1237			memset(rx_done->entry, 0, sc->rx_ring_size);
1238			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1239			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1240			cmd.data2 = slice;
1241			status |= mxge_send_cmd(sc,
1242						MXGEFW_CMD_SET_INTRQ_DMA,
1243						&cmd);
1244		}
1245	}
1246
1247	status |= mxge_send_cmd(sc,
1248				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1249
1250	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1251
1252	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1253	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1254
1255	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1256				&cmd);
1257	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1258	if (status != 0) {
1259		device_printf(sc->dev, "failed set interrupt parameters\n");
1260		return status;
1261	}
1262
1263	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1264
1265	/* run a DMA benchmark */
1266	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1267
1268	for (slice = 0; slice < sc->num_slices; slice++) {
1269		ss = &sc->ss[slice];
1270
1271		ss->irq_claim = irq_claim + (2 * slice);
1272		/* reset mcp/driver shared state back to 0 */
1273		ss->rx_done.idx = 0;
1274		ss->rx_done.cnt = 0;
1275		ss->tx.req = 0;
1276		ss->tx.done = 0;
1277		ss->tx.pkt_done = 0;
1278		ss->tx.queue_active = 0;
1279		ss->tx.activate = 0;
1280		ss->tx.deactivate = 0;
1281		ss->tx.wake = 0;
1282		ss->tx.defrag = 0;
1283		ss->tx.stall = 0;
1284		ss->rx_big.cnt = 0;
1285		ss->rx_small.cnt = 0;
1286		ss->lc.lro_bad_csum = 0;
1287		ss->lc.lro_queued = 0;
1288		ss->lc.lro_flushed = 0;
1289		if (ss->fw_stats != NULL) {
1290			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1291		}
1292	}
1293	sc->rdma_tags_available = 15;
1294	status = mxge_update_mac_address(sc);
1295	mxge_change_promisc(sc, if_getflags(sc->ifp) & IFF_PROMISC);
1296	mxge_change_pause(sc, sc->pause);
1297	mxge_set_multicast_list(sc);
1298	if (sc->throttle) {
1299		cmd.data0 = sc->throttle;
1300		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1301				  &cmd)) {
1302			device_printf(sc->dev,
1303				      "can't enable throttle\n");
1304		}
1305	}
1306	return status;
1307}
1308
1309static int
1310mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1311{
1312	mxge_cmd_t cmd;
1313	mxge_softc_t *sc;
1314	int err;
1315	unsigned int throttle;
1316
1317	sc = arg1;
1318	throttle = sc->throttle;
1319	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1320	if (err != 0) {
1321		return err;
1322	}
1323
1324	if (throttle == sc->throttle)
1325		return 0;
1326
1327	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1328		return EINVAL;
1329
1330	mtx_lock(&sc->driver_mtx);
1331	cmd.data0 = throttle;
1332	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1333	if (err == 0)
1334		sc->throttle = throttle;
1335	mtx_unlock(&sc->driver_mtx);
1336	return err;
1337}
1338
1339static int
1340mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1341{
1342	mxge_softc_t *sc;
1343	unsigned int intr_coal_delay;
1344	int err;
1345
1346	sc = arg1;
1347	intr_coal_delay = sc->intr_coal_delay;
1348	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1349	if (err != 0) {
1350		return err;
1351	}
1352	if (intr_coal_delay == sc->intr_coal_delay)
1353		return 0;
1354
1355	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1356		return EINVAL;
1357
1358	mtx_lock(&sc->driver_mtx);
1359	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1360	sc->intr_coal_delay = intr_coal_delay;
1361
1362	mtx_unlock(&sc->driver_mtx);
1363	return err;
1364}
1365
1366static int
1367mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1368{
1369	mxge_softc_t *sc;
1370	unsigned int enabled;
1371	int err;
1372
1373	sc = arg1;
1374	enabled = sc->pause;
1375	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1376	if (err != 0) {
1377		return err;
1378	}
1379	if (enabled == sc->pause)
1380		return 0;
1381
1382	mtx_lock(&sc->driver_mtx);
1383	err = mxge_change_pause(sc, enabled);
1384	mtx_unlock(&sc->driver_mtx);
1385	return err;
1386}
1387
1388static int
1389mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1390{
1391	int err;
1392
1393	if (arg1 == NULL)
1394		return EFAULT;
1395	arg2 = be32toh(*(int *)arg1);
1396	arg1 = NULL;
1397	err = sysctl_handle_int(oidp, arg1, arg2, req);
1398
1399	return err;
1400}
1401
1402static void
1403mxge_rem_sysctls(mxge_softc_t *sc)
1404{
1405	struct mxge_slice_state *ss;
1406	int slice;
1407
1408	if (sc->slice_sysctl_tree == NULL)
1409		return;
1410
1411	for (slice = 0; slice < sc->num_slices; slice++) {
1412		ss = &sc->ss[slice];
1413		if (ss == NULL || ss->sysctl_tree == NULL)
1414			continue;
1415		sysctl_ctx_free(&ss->sysctl_ctx);
1416		ss->sysctl_tree = NULL;
1417	}
1418	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1419	sc->slice_sysctl_tree = NULL;
1420}
1421
1422static void
1423mxge_add_sysctls(mxge_softc_t *sc)
1424{
1425	struct sysctl_ctx_list *ctx;
1426	struct sysctl_oid_list *children;
1427	mcp_irq_data_t *fw;
1428	struct mxge_slice_state *ss;
1429	int slice;
1430	char slice_num[8];
1431
1432	ctx = device_get_sysctl_ctx(sc->dev);
1433	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1434	fw = sc->ss[0].fw_stats;
1435
1436	/* random information */
1437	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1438		       "firmware_version",
1439		       CTLFLAG_RD, sc->fw_version,
1440		       0, "firmware version");
1441	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1442		       "serial_number",
1443		       CTLFLAG_RD, sc->serial_number_string,
1444		       0, "serial number");
1445	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1446		       "product_code",
1447		       CTLFLAG_RD, sc->product_code_string,
1448		       0, "product_code");
1449	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1450		       "pcie_link_width",
1451		       CTLFLAG_RD, &sc->link_width,
1452		       0, "tx_boundary");
1453	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1454		       "tx_boundary",
1455		       CTLFLAG_RD, &sc->tx_boundary,
1456		       0, "tx_boundary");
1457	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1458		       "write_combine",
1459		       CTLFLAG_RD, &sc->wc,
1460		       0, "write combining PIO?");
1461	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1462		       "read_dma_MBs",
1463		       CTLFLAG_RD, &sc->read_dma,
1464		       0, "DMA Read speed in MB/s");
1465	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1466		       "write_dma_MBs",
1467		       CTLFLAG_RD, &sc->write_dma,
1468		       0, "DMA Write speed in MB/s");
1469	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1470		       "read_write_dma_MBs",
1471		       CTLFLAG_RD, &sc->read_write_dma,
1472		       0, "DMA concurrent Read/Write speed in MB/s");
1473	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1474		       "watchdog_resets",
1475		       CTLFLAG_RD, &sc->watchdog_resets,
1476		       0, "Number of times NIC was reset");
1477
1478	/* performance related tunables */
1479	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1480	    "intr_coal_delay", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1481	    sc, 0, mxge_change_intr_coal, "I",
1482	    "interrupt coalescing delay in usecs");
1483
1484	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1485	    "throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1486	    mxge_change_throttle, "I", "transmit throttling");
1487
1488	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1489	    "flow_control_enabled",
1490	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1491	    mxge_change_flow_control, "I",
1492	    "interrupt coalescing delay in usecs");
1493
1494	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495		       "deassert_wait",
1496		       CTLFLAG_RW, &mxge_deassert_wait,
1497		       0, "Wait for IRQ line to go low in ihandler");
1498
1499	/* stats block from firmware is in network byte order.
1500	   Need to swap it */
1501	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1502	    "link_up", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1503	    &fw->link_up, 0, mxge_handle_be32, "I", "link up");
1504	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1505	    "rdma_tags_available", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1506	    &fw->rdma_tags_available, 0, mxge_handle_be32, "I",
1507	    "rdma_tags_available");
1508	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1509	    "dropped_bad_crc32", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1510	    &fw->dropped_bad_crc32, 0, mxge_handle_be32, "I",
1511	    "dropped_bad_crc32");
1512	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1513	    "dropped_bad_phy", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1514	    &fw->dropped_bad_phy, 0, mxge_handle_be32, "I", "dropped_bad_phy");
1515	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1516	    "dropped_link_error_or_filtered",
1517	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1518	    &fw->dropped_link_error_or_filtered, 0, mxge_handle_be32, "I",
1519	    "dropped_link_error_or_filtered");
1520	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1521	    "dropped_link_overflow",
1522	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1523	    &fw->dropped_link_overflow, 0, mxge_handle_be32, "I",
1524	    "dropped_link_overflow");
1525	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526	    "dropped_multicast_filtered",
1527	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1528	    &fw->dropped_multicast_filtered, 0, mxge_handle_be32, "I",
1529	    "dropped_multicast_filtered");
1530	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1531	    "dropped_no_big_buffer",
1532	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1533	    &fw->dropped_no_big_buffer, 0, mxge_handle_be32, "I",
1534	    "dropped_no_big_buffer");
1535	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1536	    "dropped_no_small_buffer",
1537	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1538	    &fw->dropped_no_small_buffer, 0, mxge_handle_be32, "I",
1539	    "dropped_no_small_buffer");
1540	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1541	    "dropped_overrun",
1542	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1543	    &fw->dropped_overrun, 0, mxge_handle_be32, "I",
1544	    "dropped_overrun");
1545	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546	    "dropped_pause", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1547	    &fw->dropped_pause, 0, mxge_handle_be32, "I", "dropped_pause");
1548	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549	    "dropped_runt", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1550	    &fw->dropped_runt, 0, mxge_handle_be32, "I", "dropped_runt");
1551
1552	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1553	    "dropped_unicast_filtered",
1554	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
1555	    &fw->dropped_unicast_filtered, 0, mxge_handle_be32, "I",
1556	    "dropped_unicast_filtered");
1557
1558	/* verbose printing? */
1559	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1560		       "verbose",
1561		       CTLFLAG_RW, &mxge_verbose,
1562		       0, "verbose printing");
1563
1564	/* add counters exported for debugging from all slices */
1565	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1566	sc->slice_sysctl_tree =
1567		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1568		    "slice", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1569
1570	for (slice = 0; slice < sc->num_slices; slice++) {
1571		ss = &sc->ss[slice];
1572		sysctl_ctx_init(&ss->sysctl_ctx);
1573		ctx = &ss->sysctl_ctx;
1574		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1575		sprintf(slice_num, "%d", slice);
1576		ss->sysctl_tree =
1577			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1578			    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
1579		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1580		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1581			       "rx_small_cnt",
1582			       CTLFLAG_RD, &ss->rx_small.cnt,
1583			       0, "rx_small_cnt");
1584		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1585			       "rx_big_cnt",
1586			       CTLFLAG_RD, &ss->rx_big.cnt,
1587			       0, "rx_small_cnt");
1588		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1589			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1590			       0, "number of lro merge queues flushed");
1591
1592		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1593			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1594			       0, "number of bad csums preventing LRO");
1595
1596		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1597			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1598			       0, "number of frames appended to lro merge"
1599			       "queues");
1600
1601		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1602			       "tx_req",
1603			       CTLFLAG_RD, &ss->tx.req,
1604			       0, "tx_req");
1605
1606		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1607			       "tx_done",
1608			       CTLFLAG_RD, &ss->tx.done,
1609			       0, "tx_done");
1610		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1611			       "tx_pkt_done",
1612			       CTLFLAG_RD, &ss->tx.pkt_done,
1613			       0, "tx_done");
1614		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1615			       "tx_stall",
1616			       CTLFLAG_RD, &ss->tx.stall,
1617			       0, "tx_stall");
1618		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1619			       "tx_wake",
1620			       CTLFLAG_RD, &ss->tx.wake,
1621			       0, "tx_wake");
1622		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1623			       "tx_defrag",
1624			       CTLFLAG_RD, &ss->tx.defrag,
1625			       0, "tx_defrag");
1626		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1627			       "tx_queue_active",
1628			       CTLFLAG_RD, &ss->tx.queue_active,
1629			       0, "tx_queue_active");
1630		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1631			       "tx_activate",
1632			       CTLFLAG_RD, &ss->tx.activate,
1633			       0, "tx_activate");
1634		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635			       "tx_deactivate",
1636			       CTLFLAG_RD, &ss->tx.deactivate,
1637			       0, "tx_deactivate");
1638	}
1639}
1640
1641/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1642   backwards one at a time and handle ring wraps */
1643
1644static inline void
1645mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1646			    mcp_kreq_ether_send_t *src, int cnt)
1647{
1648	int idx, starting_slot;
1649	starting_slot = tx->req;
1650	while (cnt > 1) {
1651		cnt--;
1652		idx = (starting_slot + cnt) & tx->mask;
1653		mxge_pio_copy(&tx->lanai[idx],
1654			      &src[cnt], sizeof(*src));
1655		wmb();
1656	}
1657}
1658
1659/*
1660 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1661 * at most 32 bytes at a time, so as to avoid involving the software
1662 * pio handler in the nic.   We re-write the first segment's flags
1663 * to mark them valid only after writing the entire chain
1664 */
1665
1666static inline void
1667mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1668		  int cnt)
1669{
1670	int idx, i;
1671	uint32_t *src_ints;
1672	volatile uint32_t *dst_ints;
1673	mcp_kreq_ether_send_t *srcp;
1674	volatile mcp_kreq_ether_send_t *dstp, *dst;
1675	uint8_t last_flags;
1676
1677	idx = tx->req & tx->mask;
1678
1679	last_flags = src->flags;
1680	src->flags = 0;
1681	wmb();
1682	dst = dstp = &tx->lanai[idx];
1683	srcp = src;
1684
1685	if ((idx + cnt) < tx->mask) {
1686		for (i = 0; i < (cnt - 1); i += 2) {
1687			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1688			wmb(); /* force write every 32 bytes */
1689			srcp += 2;
1690			dstp += 2;
1691		}
1692	} else {
1693		/* submit all but the first request, and ensure
1694		   that it is submitted below */
1695		mxge_submit_req_backwards(tx, src, cnt);
1696		i = 0;
1697	}
1698	if (i < cnt) {
1699		/* submit the first request */
1700		mxge_pio_copy(dstp, srcp, sizeof(*src));
1701		wmb(); /* barrier before setting valid flag */
1702	}
1703
1704	/* re-write the last 32-bits with the valid flags */
1705	src->flags = last_flags;
1706	src_ints = (uint32_t *)src;
1707	src_ints+=3;
1708	dst_ints = (volatile uint32_t *)dst;
1709	dst_ints+=3;
1710	*dst_ints =  *src_ints;
1711	tx->req += cnt;
1712	wmb();
1713}
1714
1715static int
1716mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1717    struct mxge_pkt_info *pi)
1718{
1719	struct ether_vlan_header *eh;
1720	uint16_t etype;
1721	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1722#if IFCAP_TSO6 && defined(INET6)
1723	int nxt;
1724#endif
1725
1726	eh = mtod(m, struct ether_vlan_header *);
1727	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1728		etype = ntohs(eh->evl_proto);
1729		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1730	} else {
1731		etype = ntohs(eh->evl_encap_proto);
1732		pi->ip_off = ETHER_HDR_LEN;
1733	}
1734
1735	switch (etype) {
1736	case ETHERTYPE_IP:
1737		/*
1738		 * ensure ip header is in first mbuf, copy it to a
1739		 * scratch buffer if not
1740		 */
1741		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1742		pi->ip6 = NULL;
1743		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1744			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1745			    ss->scratch);
1746			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1747		}
1748		pi->ip_hlen = pi->ip->ip_hl << 2;
1749		if (!tso)
1750			return 0;
1751
1752		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1753		    sizeof(struct tcphdr))) {
1754			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1755			    sizeof(struct tcphdr), ss->scratch);
1756			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1757		}
1758		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1759		break;
1760#if IFCAP_TSO6 && defined(INET6)
1761	case ETHERTYPE_IPV6:
1762		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1763		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1764			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1765			    ss->scratch);
1766			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1767		}
1768		nxt = 0;
1769		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1770		pi->ip_hlen -= pi->ip_off;
1771		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1772			return EINVAL;
1773
1774		if (!tso)
1775			return 0;
1776
1777		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1778			return EINVAL;
1779
1780		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1781		    sizeof(struct tcphdr))) {
1782			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1783			    sizeof(struct tcphdr), ss->scratch);
1784			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1785		}
1786		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1787		break;
1788#endif
1789	default:
1790		return EINVAL;
1791	}
1792	return 0;
1793}
1794
1795#if IFCAP_TSO4
1796
1797static void
1798mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1799	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1800{
1801	mxge_tx_ring_t *tx;
1802	mcp_kreq_ether_send_t *req;
1803	bus_dma_segment_t *seg;
1804	uint32_t low, high_swapped;
1805	int len, seglen, cum_len, cum_len_next;
1806	int next_is_first, chop, cnt, rdma_count, small;
1807	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1808	uint8_t flags, flags_next;
1809	static int once;
1810
1811	mss = m->m_pkthdr.tso_segsz;
1812
1813	/* negative cum_len signifies to the
1814	 * send loop that we are still in the
1815	 * header portion of the TSO packet.
1816	 */
1817
1818	cksum_offset = pi->ip_off + pi->ip_hlen;
1819	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1820
1821	/* TSO implies checksum offload on this hardware */
1822	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1823		/*
1824		 * If packet has full TCP csum, replace it with pseudo hdr
1825		 * sum that the NIC expects, otherwise the NIC will emit
1826		 * packets with bad TCP checksums.
1827		 */
1828		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1829		if (pi->ip6) {
1830#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1831			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1832			sum = in6_cksum_pseudo(pi->ip6,
1833			    m->m_pkthdr.len - cksum_offset,
1834			    IPPROTO_TCP, 0);
1835#endif
1836		} else {
1837#ifdef INET
1838			m->m_pkthdr.csum_flags |= CSUM_TCP;
1839			sum = in_pseudo(pi->ip->ip_src.s_addr,
1840			    pi->ip->ip_dst.s_addr,
1841			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1842				    cksum_offset)));
1843#endif
1844		}
1845		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1846		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1847	}
1848	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1849
1850	/* for TSO, pseudo_hdr_offset holds mss.
1851	 * The firmware figures out where to put
1852	 * the checksum by parsing the header. */
1853	pseudo_hdr_offset = htobe16(mss);
1854
1855	if (pi->ip6) {
1856		/*
1857		 * for IPv6 TSO, the "checksum offset" is re-purposed
1858		 * to store the TCP header len
1859		 */
1860		cksum_offset = (pi->tcp->th_off << 2);
1861	}
1862
1863	tx = &ss->tx;
1864	req = tx->req_list;
1865	seg = tx->seg_list;
1866	cnt = 0;
1867	rdma_count = 0;
1868	/* "rdma_count" is the number of RDMAs belonging to the
1869	 * current packet BEFORE the current send request. For
1870	 * non-TSO packets, this is equal to "count".
1871	 * For TSO packets, rdma_count needs to be reset
1872	 * to 0 after a segment cut.
1873	 *
1874	 * The rdma_count field of the send request is
1875	 * the number of RDMAs of the packet starting at
1876	 * that request. For TSO send requests with one ore more cuts
1877	 * in the middle, this is the number of RDMAs starting
1878	 * after the last cut in the request. All previous
1879	 * segments before the last cut implicitly have 1 RDMA.
1880	 *
1881	 * Since the number of RDMAs is not known beforehand,
1882	 * it must be filled-in retroactively - after each
1883	 * segmentation cut or at the end of the entire packet.
1884	 */
1885
1886	while (busdma_seg_cnt) {
1887		/* Break the busdma segment up into pieces*/
1888		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1889		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1890		len = seg->ds_len;
1891
1892		while (len) {
1893			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1894			seglen = len;
1895			cum_len_next = cum_len + seglen;
1896			(req-rdma_count)->rdma_count = rdma_count + 1;
1897			if (__predict_true(cum_len >= 0)) {
1898				/* payload */
1899				chop = (cum_len_next > mss);
1900				cum_len_next = cum_len_next % mss;
1901				next_is_first = (cum_len_next == 0);
1902				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1903				flags_next |= next_is_first *
1904					MXGEFW_FLAGS_FIRST;
1905				rdma_count |= -(chop | next_is_first);
1906				rdma_count += chop & !next_is_first;
1907			} else if (cum_len_next >= 0) {
1908				/* header ends */
1909				rdma_count = -1;
1910				cum_len_next = 0;
1911				seglen = -cum_len;
1912				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1913				flags_next = MXGEFW_FLAGS_TSO_PLD |
1914					MXGEFW_FLAGS_FIRST |
1915					(small * MXGEFW_FLAGS_SMALL);
1916			    }
1917
1918			req->addr_high = high_swapped;
1919			req->addr_low = htobe32(low);
1920			req->pseudo_hdr_offset = pseudo_hdr_offset;
1921			req->pad = 0;
1922			req->rdma_count = 1;
1923			req->length = htobe16(seglen);
1924			req->cksum_offset = cksum_offset;
1925			req->flags = flags | ((cum_len & 1) *
1926					      MXGEFW_FLAGS_ALIGN_ODD);
1927			low += seglen;
1928			len -= seglen;
1929			cum_len = cum_len_next;
1930			flags = flags_next;
1931			req++;
1932			cnt++;
1933			rdma_count++;
1934			if (cksum_offset != 0 && !pi->ip6) {
1935				if (__predict_false(cksum_offset > seglen))
1936					cksum_offset -= seglen;
1937				else
1938					cksum_offset = 0;
1939			}
1940			if (__predict_false(cnt > tx->max_desc))
1941				goto drop;
1942		}
1943		busdma_seg_cnt--;
1944		seg++;
1945	}
1946	(req-rdma_count)->rdma_count = rdma_count;
1947
1948	do {
1949		req--;
1950		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1951	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1952
1953	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1954	mxge_submit_req(tx, tx->req_list, cnt);
1955
1956	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1957		/* tell the NIC to start polling this slice */
1958		*tx->send_go = 1;
1959		tx->queue_active = 1;
1960		tx->activate++;
1961		wmb();
1962	}
1963
1964	return;
1965
1966drop:
1967	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1968	m_freem(m);
1969	ss->oerrors++;
1970	if (!once) {
1971		printf("tx->max_desc exceeded via TSO!\n");
1972		printf("mss = %d, %ld, %d!\n", mss,
1973		       (long)seg - (long)tx->seg_list, tx->max_desc);
1974		once = 1;
1975	}
1976	return;
1977
1978}
1979
1980#endif /* IFCAP_TSO4 */
1981
1982#ifdef MXGE_NEW_VLAN_API
1983/*
1984 * We reproduce the software vlan tag insertion from
1985 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1986 * vlan tag insertion. We need to advertise this in order to have the
1987 * vlan interface respect our csum offload flags.
1988 */
1989static struct mbuf *
1990mxge_vlan_tag_insert(struct mbuf *m)
1991{
1992	struct ether_vlan_header *evl;
1993
1994	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
1995	if (__predict_false(m == NULL))
1996		return NULL;
1997	if (m->m_len < sizeof(*evl)) {
1998		m = m_pullup(m, sizeof(*evl));
1999		if (__predict_false(m == NULL))
2000			return NULL;
2001	}
2002	/*
2003	 * Transform the Ethernet header into an Ethernet header
2004	 * with 802.1Q encapsulation.
2005	 */
2006	evl = mtod(m, struct ether_vlan_header *);
2007	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2008	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2009	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2010	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2011	m->m_flags &= ~M_VLANTAG;
2012	return m;
2013}
2014#endif /* MXGE_NEW_VLAN_API */
2015
2016static void
2017mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2018{
2019	struct mxge_pkt_info pi = {0,0,0,0};
2020	mxge_softc_t *sc;
2021	mcp_kreq_ether_send_t *req;
2022	bus_dma_segment_t *seg;
2023	struct mbuf *m_tmp;
2024	mxge_tx_ring_t *tx;
2025	int cnt, cum_len, err, i, idx, odd_flag;
2026	uint16_t pseudo_hdr_offset;
2027	uint8_t flags, cksum_offset;
2028
2029	sc = ss->sc;
2030	tx = &ss->tx;
2031
2032#ifdef MXGE_NEW_VLAN_API
2033	if (m->m_flags & M_VLANTAG) {
2034		m = mxge_vlan_tag_insert(m);
2035		if (__predict_false(m == NULL))
2036			goto drop_without_m;
2037	}
2038#endif
2039	if (m->m_pkthdr.csum_flags &
2040	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2041		if (mxge_parse_tx(ss, m, &pi))
2042			goto drop;
2043	}
2044
2045	/* (try to) map the frame for DMA */
2046	idx = tx->req & tx->mask;
2047	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2048				      m, tx->seg_list, &cnt,
2049				      BUS_DMA_NOWAIT);
2050	if (__predict_false(err == EFBIG)) {
2051		/* Too many segments in the chain.  Try
2052		   to defrag */
2053		m_tmp = m_defrag(m, M_NOWAIT);
2054		if (m_tmp == NULL) {
2055			goto drop;
2056		}
2057		ss->tx.defrag++;
2058		m = m_tmp;
2059		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2060					      tx->info[idx].map,
2061					      m, tx->seg_list, &cnt,
2062					      BUS_DMA_NOWAIT);
2063	}
2064	if (__predict_false(err != 0)) {
2065		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2066			      " packet len = %d\n", err, m->m_pkthdr.len);
2067		goto drop;
2068	}
2069	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2070			BUS_DMASYNC_PREWRITE);
2071	tx->info[idx].m = m;
2072
2073#if IFCAP_TSO4
2074	/* TSO is different enough, we handle it in another routine */
2075	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2076		mxge_encap_tso(ss, m, cnt, &pi);
2077		return;
2078	}
2079#endif
2080
2081	req = tx->req_list;
2082	cksum_offset = 0;
2083	pseudo_hdr_offset = 0;
2084	flags = MXGEFW_FLAGS_NO_TSO;
2085
2086	/* checksum offloading? */
2087	if (m->m_pkthdr.csum_flags &
2088	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2089		/* ensure ip header is in first mbuf, copy
2090		   it to a scratch buffer if not */
2091		cksum_offset = pi.ip_off + pi.ip_hlen;
2092		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2093		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2094		req->cksum_offset = cksum_offset;
2095		flags |= MXGEFW_FLAGS_CKSUM;
2096		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2097	} else {
2098		odd_flag = 0;
2099	}
2100	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2101		flags |= MXGEFW_FLAGS_SMALL;
2102
2103	/* convert segments into a request list */
2104	cum_len = 0;
2105	seg = tx->seg_list;
2106	req->flags = MXGEFW_FLAGS_FIRST;
2107	for (i = 0; i < cnt; i++) {
2108		req->addr_low =
2109			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2110		req->addr_high =
2111			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2112		req->length = htobe16(seg->ds_len);
2113		req->cksum_offset = cksum_offset;
2114		if (cksum_offset > seg->ds_len)
2115			cksum_offset -= seg->ds_len;
2116		else
2117			cksum_offset = 0;
2118		req->pseudo_hdr_offset = pseudo_hdr_offset;
2119		req->pad = 0; /* complete solid 16-byte block */
2120		req->rdma_count = 1;
2121		req->flags |= flags | ((cum_len & 1) * odd_flag);
2122		cum_len += seg->ds_len;
2123		seg++;
2124		req++;
2125		req->flags = 0;
2126	}
2127	req--;
2128	/* pad runts to 60 bytes */
2129	if (cum_len < 60) {
2130		req++;
2131		req->addr_low =
2132			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2133		req->addr_high =
2134			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2135		req->length = htobe16(60 - cum_len);
2136		req->cksum_offset = 0;
2137		req->pseudo_hdr_offset = pseudo_hdr_offset;
2138		req->pad = 0; /* complete solid 16-byte block */
2139		req->rdma_count = 1;
2140		req->flags |= flags | ((cum_len & 1) * odd_flag);
2141		cnt++;
2142	}
2143
2144	tx->req_list[0].rdma_count = cnt;
2145#if 0
2146	/* print what the firmware will see */
2147	for (i = 0; i < cnt; i++) {
2148		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2149		    "cso:%d, flags:0x%x, rdma:%d\n",
2150		    i, (int)ntohl(tx->req_list[i].addr_high),
2151		    (int)ntohl(tx->req_list[i].addr_low),
2152		    (int)ntohs(tx->req_list[i].length),
2153		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2154		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2155		    tx->req_list[i].rdma_count);
2156	}
2157	printf("--------------\n");
2158#endif
2159	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2160	mxge_submit_req(tx, tx->req_list, cnt);
2161
2162	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2163		/* tell the NIC to start polling this slice */
2164		*tx->send_go = 1;
2165		tx->queue_active = 1;
2166		tx->activate++;
2167		wmb();
2168	}
2169
2170	return;
2171
2172drop:
2173	m_freem(m);
2174drop_without_m:
2175	ss->oerrors++;
2176	return;
2177}
2178
2179static void
2180mxge_qflush(if_t ifp)
2181{
2182	mxge_softc_t *sc = if_getsoftc(ifp);
2183	mxge_tx_ring_t *tx;
2184	struct mbuf *m;
2185	int slice;
2186
2187	for (slice = 0; slice < sc->num_slices; slice++) {
2188		tx = &sc->ss[slice].tx;
2189		mtx_lock(&tx->mtx);
2190		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2191			m_freem(m);
2192		mtx_unlock(&tx->mtx);
2193	}
2194	if_qflush(ifp);
2195}
2196
2197static inline void
2198mxge_start_locked(struct mxge_slice_state *ss)
2199{
2200	mxge_softc_t *sc;
2201	struct mbuf *m;
2202	if_t ifp;
2203	mxge_tx_ring_t *tx;
2204
2205	sc = ss->sc;
2206	ifp = sc->ifp;
2207	tx = &ss->tx;
2208
2209	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2210		m = drbr_dequeue(ifp, tx->br);
2211		if (m == NULL) {
2212			return;
2213		}
2214		/* let BPF see it */
2215		BPF_MTAP(ifp, m);
2216
2217		/* give it to the nic */
2218		mxge_encap(ss, m);
2219	}
2220	/* ran out of transmit slots */
2221	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2222	    && (!drbr_empty(ifp, tx->br))) {
2223		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2224		tx->stall++;
2225	}
2226}
2227
2228static int
2229mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2230{
2231	mxge_softc_t *sc;
2232	if_t ifp;
2233	mxge_tx_ring_t *tx;
2234	int err;
2235
2236	sc = ss->sc;
2237	ifp = sc->ifp;
2238	tx = &ss->tx;
2239
2240	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2241	    IFF_DRV_RUNNING) {
2242		err = drbr_enqueue(ifp, tx->br, m);
2243		return (err);
2244	}
2245
2246	if (!drbr_needs_enqueue(ifp, tx->br) &&
2247	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2248		/* let BPF see it */
2249		BPF_MTAP(ifp, m);
2250		/* give it to the nic */
2251		mxge_encap(ss, m);
2252	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2253		return (err);
2254	}
2255	if (!drbr_empty(ifp, tx->br))
2256		mxge_start_locked(ss);
2257	return (0);
2258}
2259
2260static int
2261mxge_transmit(if_t ifp, struct mbuf *m)
2262{
2263	mxge_softc_t *sc = if_getsoftc(ifp);
2264	struct mxge_slice_state *ss;
2265	mxge_tx_ring_t *tx;
2266	int err = 0;
2267	int slice;
2268
2269	slice = m->m_pkthdr.flowid;
2270	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2271
2272	ss = &sc->ss[slice];
2273	tx = &ss->tx;
2274
2275	if (mtx_trylock(&tx->mtx)) {
2276		err = mxge_transmit_locked(ss, m);
2277		mtx_unlock(&tx->mtx);
2278	} else {
2279		err = drbr_enqueue(ifp, tx->br, m);
2280	}
2281
2282	return (err);
2283}
2284
2285static void
2286mxge_start(if_t ifp)
2287{
2288	mxge_softc_t *sc = if_getsoftc(ifp);
2289	struct mxge_slice_state *ss;
2290
2291	/* only use the first slice for now */
2292	ss = &sc->ss[0];
2293	mtx_lock(&ss->tx.mtx);
2294	mxge_start_locked(ss);
2295	mtx_unlock(&ss->tx.mtx);
2296}
2297
2298/*
2299 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2300 * at most 32 bytes at a time, so as to avoid involving the software
2301 * pio handler in the nic.   We re-write the first segment's low
2302 * DMA address to mark it valid only after we write the entire chunk
2303 * in a burst
2304 */
2305static inline void
2306mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2307		mcp_kreq_ether_recv_t *src)
2308{
2309	uint32_t low;
2310
2311	low = src->addr_low;
2312	src->addr_low = 0xffffffff;
2313	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2314	wmb();
2315	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2316	wmb();
2317	src->addr_low = low;
2318	dst->addr_low = low;
2319	wmb();
2320}
2321
2322static int
2323mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2324{
2325	bus_dma_segment_t seg;
2326	struct mbuf *m;
2327	mxge_rx_ring_t *rx = &ss->rx_small;
2328	int cnt, err;
2329
2330	m = m_gethdr(M_NOWAIT, MT_DATA);
2331	if (m == NULL) {
2332		rx->alloc_fail++;
2333		err = ENOBUFS;
2334		goto done;
2335	}
2336	m->m_len = MHLEN;
2337	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2338				      &seg, &cnt, BUS_DMA_NOWAIT);
2339	if (err != 0) {
2340		m_free(m);
2341		goto done;
2342	}
2343	rx->info[idx].m = m;
2344	rx->shadow[idx].addr_low =
2345		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2346	rx->shadow[idx].addr_high =
2347		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2348
2349done:
2350	if ((idx & 7) == 7)
2351		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2352	return err;
2353}
2354
2355static int
2356mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2357{
2358	bus_dma_segment_t seg[3];
2359	struct mbuf *m;
2360	mxge_rx_ring_t *rx = &ss->rx_big;
2361	int cnt, err, i;
2362
2363	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2364	if (m == NULL) {
2365		rx->alloc_fail++;
2366		err = ENOBUFS;
2367		goto done;
2368	}
2369	m->m_len = rx->mlen;
2370	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2371				      seg, &cnt, BUS_DMA_NOWAIT);
2372	if (err != 0) {
2373		m_free(m);
2374		goto done;
2375	}
2376	rx->info[idx].m = m;
2377	rx->shadow[idx].addr_low =
2378		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2379	rx->shadow[idx].addr_high =
2380		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2381
2382done:
2383       for (i = 0; i < rx->nbufs; i++) {
2384		if ((idx & 7) == 7) {
2385			mxge_submit_8rx(&rx->lanai[idx - 7],
2386					&rx->shadow[idx - 7]);
2387		}
2388		idx++;
2389	}
2390	return err;
2391}
2392
2393#ifdef INET6
2394
2395static uint16_t
2396mxge_csum_generic(uint16_t *raw, int len)
2397{
2398	uint32_t csum;
2399
2400	csum = 0;
2401	while (len > 0) {
2402		csum += *raw;
2403		raw++;
2404		len -= 2;
2405	}
2406	csum = (csum >> 16) + (csum & 0xffff);
2407	csum = (csum >> 16) + (csum & 0xffff);
2408	return (uint16_t)csum;
2409}
2410
2411static inline uint16_t
2412mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2413{
2414	uint32_t partial;
2415	int nxt, cksum_offset;
2416	struct ip6_hdr *ip6 = p;
2417	uint16_t c;
2418
2419	nxt = ip6->ip6_nxt;
2420	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2421	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2422		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2423					   IPPROTO_IPV6, &nxt);
2424		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2425			return (1);
2426	}
2427
2428	/*
2429	 * IPv6 headers do not contain a checksum, and hence
2430	 * do not checksum to zero, so they don't "fall out"
2431	 * of the partial checksum calculation like IPv4
2432	 * headers do.  We need to fix the partial checksum by
2433	 * subtracting the checksum of the IPv6 header.
2434	 */
2435
2436	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2437				    ETHER_HDR_LEN);
2438	csum += ~partial;
2439	csum +=	 (csum < ~partial);
2440	csum = (csum >> 16) + (csum & 0xFFFF);
2441	csum = (csum >> 16) + (csum & 0xFFFF);
2442	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2443			     csum);
2444	c ^= 0xffff;
2445	return (c);
2446}
2447#endif /* INET6 */
2448/*
2449 *  Myri10GE hardware checksums are not valid if the sender
2450 *  padded the frame with non-zero padding.  This is because
2451 *  the firmware just does a simple 16-bit 1s complement
2452 *  checksum across the entire frame, excluding the first 14
2453 *  bytes.  It is best to simply to check the checksum and
2454 *  tell the stack about it only if the checksum is good
2455 */
2456
2457static inline uint16_t
2458mxge_rx_csum(struct mbuf *m, int csum)
2459{
2460	struct ether_header *eh;
2461#ifdef INET
2462	struct ip *ip;
2463#endif
2464#if defined(INET) || defined(INET6)
2465	int cap = if_getcapenable(m->m_pkthdr.rcvif);
2466#endif
2467	uint16_t c, etype;
2468
2469	eh = mtod(m, struct ether_header *);
2470	etype = ntohs(eh->ether_type);
2471	switch (etype) {
2472#ifdef INET
2473	case ETHERTYPE_IP:
2474		if ((cap & IFCAP_RXCSUM) == 0)
2475			return (1);
2476		ip = (struct ip *)(eh + 1);
2477		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2478			return (1);
2479		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2480			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2481				    (ip->ip_hl << 2) + ip->ip_p));
2482		c ^= 0xffff;
2483		break;
2484#endif
2485#ifdef INET6
2486	case ETHERTYPE_IPV6:
2487		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2488			return (1);
2489		c = mxge_rx_csum6((eh + 1), m, csum);
2490		break;
2491#endif
2492	default:
2493		c = 1;
2494	}
2495	return (c);
2496}
2497
2498static void
2499mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2500{
2501	struct ether_vlan_header *evl;
2502	uint32_t partial;
2503
2504	evl = mtod(m, struct ether_vlan_header *);
2505
2506	/*
2507	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2508	 * after what the firmware thought was the end of the ethernet
2509	 * header.
2510	 */
2511
2512	/* put checksum into host byte order */
2513	*csum = ntohs(*csum);
2514	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2515	(*csum) += ~partial;
2516	(*csum) +=  ((*csum) < ~partial);
2517	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2518	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2519
2520	/* restore checksum to network byte order;
2521	   later consumers expect this */
2522	*csum = htons(*csum);
2523
2524	/* save the tag */
2525#ifdef MXGE_NEW_VLAN_API
2526	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2527#else
2528	{
2529		struct m_tag *mtag;
2530		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2531				   M_NOWAIT);
2532		if (mtag == NULL)
2533			return;
2534		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2535		m_tag_prepend(m, mtag);
2536	}
2537
2538#endif
2539	m->m_flags |= M_VLANTAG;
2540
2541	/*
2542	 * Remove the 802.1q header by copying the Ethernet
2543	 * addresses over it and adjusting the beginning of
2544	 * the data in the mbuf.  The encapsulated Ethernet
2545	 * type field is already in place.
2546	 */
2547	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2548	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2549	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2550}
2551
2552static inline void
2553mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2554		 uint32_t csum, int lro)
2555{
2556	mxge_softc_t *sc;
2557	if_t ifp;
2558	struct mbuf *m;
2559	struct ether_header *eh;
2560	mxge_rx_ring_t *rx;
2561	bus_dmamap_t old_map;
2562	int idx;
2563
2564	sc = ss->sc;
2565	ifp = sc->ifp;
2566	rx = &ss->rx_big;
2567	idx = rx->cnt & rx->mask;
2568	rx->cnt += rx->nbufs;
2569	/* save a pointer to the received mbuf */
2570	m = rx->info[idx].m;
2571	/* try to replace the received mbuf */
2572	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2573		/* drop the frame -- the old mbuf is re-cycled */
2574		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2575		return;
2576	}
2577
2578	/* unmap the received buffer */
2579	old_map = rx->info[idx].map;
2580	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2581	bus_dmamap_unload(rx->dmat, old_map);
2582
2583	/* swap the bus_dmamap_t's */
2584	rx->info[idx].map = rx->extra_map;
2585	rx->extra_map = old_map;
2586
2587	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2588	 * aligned */
2589	m->m_data += MXGEFW_PAD;
2590
2591	m->m_pkthdr.rcvif = ifp;
2592	m->m_len = m->m_pkthdr.len = len;
2593	ss->ipackets++;
2594	eh = mtod(m, struct ether_header *);
2595	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2596		mxge_vlan_tag_remove(m, &csum);
2597	}
2598	/* flowid only valid if RSS hashing is enabled */
2599	if (sc->num_slices > 1) {
2600		m->m_pkthdr.flowid = (ss - sc->ss);
2601		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2602	}
2603	/* if the checksum is valid, mark it in the mbuf header */
2604	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2605	    (0 == mxge_rx_csum(m, csum))) {
2606		/* Tell the stack that the  checksum is good */
2607		m->m_pkthdr.csum_data = 0xffff;
2608		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2609			CSUM_DATA_VALID;
2610
2611#if defined(INET) || defined (INET6)
2612		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2613			return;
2614#endif
2615	}
2616	/* pass the frame up the stack */
2617	if_input(ifp, m);
2618}
2619
2620static inline void
2621mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2622		   uint32_t csum, int lro)
2623{
2624	mxge_softc_t *sc;
2625	if_t ifp;
2626	struct ether_header *eh;
2627	struct mbuf *m;
2628	mxge_rx_ring_t *rx;
2629	bus_dmamap_t old_map;
2630	int idx;
2631
2632	sc = ss->sc;
2633	ifp = sc->ifp;
2634	rx = &ss->rx_small;
2635	idx = rx->cnt & rx->mask;
2636	rx->cnt++;
2637	/* save a pointer to the received mbuf */
2638	m = rx->info[idx].m;
2639	/* try to replace the received mbuf */
2640	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2641		/* drop the frame -- the old mbuf is re-cycled */
2642		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2643		return;
2644	}
2645
2646	/* unmap the received buffer */
2647	old_map = rx->info[idx].map;
2648	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2649	bus_dmamap_unload(rx->dmat, old_map);
2650
2651	/* swap the bus_dmamap_t's */
2652	rx->info[idx].map = rx->extra_map;
2653	rx->extra_map = old_map;
2654
2655	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2656	 * aligned */
2657	m->m_data += MXGEFW_PAD;
2658
2659	m->m_pkthdr.rcvif = ifp;
2660	m->m_len = m->m_pkthdr.len = len;
2661	ss->ipackets++;
2662	eh = mtod(m, struct ether_header *);
2663	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2664		mxge_vlan_tag_remove(m, &csum);
2665	}
2666	/* flowid only valid if RSS hashing is enabled */
2667	if (sc->num_slices > 1) {
2668		m->m_pkthdr.flowid = (ss - sc->ss);
2669		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2670	}
2671	/* if the checksum is valid, mark it in the mbuf header */
2672	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2673	    (0 == mxge_rx_csum(m, csum))) {
2674		/* Tell the stack that the  checksum is good */
2675		m->m_pkthdr.csum_data = 0xffff;
2676		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2677			CSUM_DATA_VALID;
2678
2679#if defined(INET) || defined (INET6)
2680		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2681			return;
2682#endif
2683	}
2684	/* pass the frame up the stack */
2685	if_input(ifp, m);
2686}
2687
2688static inline void
2689mxge_clean_rx_done(struct mxge_slice_state *ss)
2690{
2691	mxge_rx_done_t *rx_done = &ss->rx_done;
2692	int limit = 0;
2693	uint16_t length;
2694	uint16_t checksum;
2695	int lro;
2696
2697	lro = if_getcapenable(ss->sc->ifp) & IFCAP_LRO;
2698	while (rx_done->entry[rx_done->idx].length != 0) {
2699		length = ntohs(rx_done->entry[rx_done->idx].length);
2700		rx_done->entry[rx_done->idx].length = 0;
2701		checksum = rx_done->entry[rx_done->idx].checksum;
2702		if (length <= (MHLEN - MXGEFW_PAD))
2703			mxge_rx_done_small(ss, length, checksum, lro);
2704		else
2705			mxge_rx_done_big(ss, length, checksum, lro);
2706		rx_done->cnt++;
2707		rx_done->idx = rx_done->cnt & rx_done->mask;
2708
2709		/* limit potential for livelock */
2710		if (__predict_false(++limit > rx_done->mask / 2))
2711			break;
2712	}
2713#if defined(INET)  || defined (INET6)
2714	tcp_lro_flush_all(&ss->lc);
2715#endif
2716}
2717
2718static inline void
2719mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2720{
2721	if_t ifp __unused;
2722	mxge_tx_ring_t *tx;
2723	struct mbuf *m;
2724	bus_dmamap_t map;
2725	int idx;
2726	int *flags;
2727
2728	tx = &ss->tx;
2729	ifp = ss->sc->ifp;
2730	while (tx->pkt_done != mcp_idx) {
2731		idx = tx->done & tx->mask;
2732		tx->done++;
2733		m = tx->info[idx].m;
2734		/* mbuf and DMA map only attached to the first
2735		   segment per-mbuf */
2736		if (m != NULL) {
2737			ss->obytes += m->m_pkthdr.len;
2738			if (m->m_flags & M_MCAST)
2739				ss->omcasts++;
2740			ss->opackets++;
2741			tx->info[idx].m = NULL;
2742			map = tx->info[idx].map;
2743			bus_dmamap_unload(tx->dmat, map);
2744			m_freem(m);
2745		}
2746		if (tx->info[idx].flag) {
2747			tx->info[idx].flag = 0;
2748			tx->pkt_done++;
2749		}
2750	}
2751
2752	/* If we have space, clear IFF_OACTIVE to tell the stack that
2753	   its OK to send packets */
2754	flags = &ss->if_drv_flags;
2755	mtx_lock(&ss->tx.mtx);
2756	if ((*flags) & IFF_DRV_OACTIVE &&
2757	    tx->req - tx->done < (tx->mask + 1)/4) {
2758		*(flags) &= ~IFF_DRV_OACTIVE;
2759		ss->tx.wake++;
2760		mxge_start_locked(ss);
2761	}
2762	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2763		/* let the NIC stop polling this queue, since there
2764		 * are no more transmits pending */
2765		if (tx->req == tx->done) {
2766			*tx->send_stop = 1;
2767			tx->queue_active = 0;
2768			tx->deactivate++;
2769			wmb();
2770		}
2771	}
2772	mtx_unlock(&ss->tx.mtx);
2773}
2774
2775static struct mxge_media_type mxge_xfp_media_types[] =
2776{
2777	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2778	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2779	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2780	{0,		(1 << 5),	"10GBASE-ER"},
2781	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2782	{0,		(1 << 3),	"10GBASE-SW"},
2783	{0,		(1 << 2),	"10GBASE-LW"},
2784	{0,		(1 << 1),	"10GBASE-EW"},
2785	{0,		(1 << 0),	"Reserved"}
2786};
2787static struct mxge_media_type mxge_sfp_media_types[] =
2788{
2789	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2790	{0,		(1 << 7),	"Reserved"},
2791	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2792	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2793	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2794	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2795};
2796
2797static void
2798mxge_media_set(mxge_softc_t *sc, int media_type)
2799{
2800
2801	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2802		    0, NULL);
2803	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2804	sc->current_media = media_type;
2805	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2806}
2807
2808static void
2809mxge_media_init(mxge_softc_t *sc)
2810{
2811	char *ptr;
2812	int i;
2813
2814	ifmedia_removeall(&sc->media);
2815	mxge_media_set(sc, IFM_AUTO);
2816
2817	/*
2818	 * parse the product code to deterimine the interface type
2819	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2820	 * after the 3rd dash in the driver's cached copy of the
2821	 * EEPROM's product code string.
2822	 */
2823	ptr = sc->product_code_string;
2824	if (ptr == NULL) {
2825		device_printf(sc->dev, "Missing product code\n");
2826		return;
2827	}
2828
2829	for (i = 0; i < 3; i++, ptr++) {
2830		ptr = strchr(ptr, '-');
2831		if (ptr == NULL) {
2832			device_printf(sc->dev,
2833				      "only %d dashes in PC?!?\n", i);
2834			return;
2835		}
2836	}
2837	if (*ptr == 'C' || *(ptr +1) == 'C') {
2838		/* -C is CX4 */
2839		sc->connector = MXGE_CX4;
2840		mxge_media_set(sc, IFM_10G_CX4);
2841	} else if (*ptr == 'Q') {
2842		/* -Q is Quad Ribbon Fiber */
2843		sc->connector = MXGE_QRF;
2844		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2845		/* FreeBSD has no media type for Quad ribbon fiber */
2846	} else if (*ptr == 'R') {
2847		/* -R is XFP */
2848		sc->connector = MXGE_XFP;
2849	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2850		/* -S or -2S is SFP+ */
2851		sc->connector = MXGE_SFP;
2852	} else {
2853		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2854	}
2855}
2856
2857/*
2858 * Determine the media type for a NIC.  Some XFPs will identify
2859 * themselves only when their link is up, so this is initiated via a
2860 * link up interrupt.  However, this can potentially take up to
2861 * several milliseconds, so it is run via the watchdog routine, rather
2862 * than in the interrupt handler itself.
2863 */
2864static void
2865mxge_media_probe(mxge_softc_t *sc)
2866{
2867	mxge_cmd_t cmd;
2868	char *cage_type;
2869
2870	struct mxge_media_type *mxge_media_types = NULL;
2871	int i, err, ms, mxge_media_type_entries;
2872	uint32_t byte;
2873
2874	sc->need_media_probe = 0;
2875
2876	if (sc->connector == MXGE_XFP) {
2877		/* -R is XFP */
2878		mxge_media_types = mxge_xfp_media_types;
2879		mxge_media_type_entries =
2880			nitems(mxge_xfp_media_types);
2881		byte = MXGE_XFP_COMPLIANCE_BYTE;
2882		cage_type = "XFP";
2883	} else 	if (sc->connector == MXGE_SFP) {
2884		/* -S or -2S is SFP+ */
2885		mxge_media_types = mxge_sfp_media_types;
2886		mxge_media_type_entries =
2887			nitems(mxge_sfp_media_types);
2888		cage_type = "SFP+";
2889		byte = 3;
2890	} else {
2891		/* nothing to do; media type cannot change */
2892		return;
2893	}
2894
2895	/*
2896	 * At this point we know the NIC has an XFP cage, so now we
2897	 * try to determine what is in the cage by using the
2898	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2899	 * register.  We read just one byte, which may take over
2900	 * a millisecond
2901	 */
2902
2903	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2904	cmd.data1 = byte;
2905	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2906	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2907		device_printf(sc->dev, "failed to read XFP\n");
2908	}
2909	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2910		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2911	}
2912	if (err != MXGEFW_CMD_OK) {
2913		return;
2914	}
2915
2916	/* now we wait for the data to be cached */
2917	cmd.data0 = byte;
2918	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2919	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2920		DELAY(1000);
2921		cmd.data0 = byte;
2922		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2923	}
2924	if (err != MXGEFW_CMD_OK) {
2925		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2926			      cage_type, err, ms);
2927		return;
2928	}
2929
2930	if (cmd.data0 == mxge_media_types[0].bitmask) {
2931		if (mxge_verbose)
2932			device_printf(sc->dev, "%s:%s\n", cage_type,
2933				      mxge_media_types[0].name);
2934		if (sc->current_media != mxge_media_types[0].flag) {
2935			mxge_media_init(sc);
2936			mxge_media_set(sc, mxge_media_types[0].flag);
2937		}
2938		return;
2939	}
2940	for (i = 1; i < mxge_media_type_entries; i++) {
2941		if (cmd.data0 & mxge_media_types[i].bitmask) {
2942			if (mxge_verbose)
2943				device_printf(sc->dev, "%s:%s\n",
2944					      cage_type,
2945					      mxge_media_types[i].name);
2946
2947			if (sc->current_media != mxge_media_types[i].flag) {
2948				mxge_media_init(sc);
2949				mxge_media_set(sc, mxge_media_types[i].flag);
2950			}
2951			return;
2952		}
2953	}
2954	if (mxge_verbose)
2955		device_printf(sc->dev, "%s media 0x%x unknown\n",
2956			      cage_type, cmd.data0);
2957
2958	return;
2959}
2960
2961static void
2962mxge_intr(void *arg)
2963{
2964	struct mxge_slice_state *ss = arg;
2965	mxge_softc_t *sc = ss->sc;
2966	mcp_irq_data_t *stats = ss->fw_stats;
2967	mxge_tx_ring_t *tx = &ss->tx;
2968	mxge_rx_done_t *rx_done = &ss->rx_done;
2969	uint32_t send_done_count;
2970	uint8_t valid;
2971
2972	/* make sure the DMA has finished */
2973	if (!stats->valid) {
2974		return;
2975	}
2976	valid = stats->valid;
2977
2978	if (sc->legacy_irq) {
2979		/* lower legacy IRQ  */
2980		*sc->irq_deassert = 0;
2981		if (!mxge_deassert_wait)
2982			/* don't wait for conf. that irq is low */
2983			stats->valid = 0;
2984	} else {
2985		stats->valid = 0;
2986	}
2987
2988	/* loop while waiting for legacy irq deassertion */
2989	do {
2990		/* check for transmit completes and receives */
2991		send_done_count = be32toh(stats->send_done_count);
2992		while ((send_done_count != tx->pkt_done) ||
2993		       (rx_done->entry[rx_done->idx].length != 0)) {
2994			if (send_done_count != tx->pkt_done)
2995				mxge_tx_done(ss, (int)send_done_count);
2996			mxge_clean_rx_done(ss);
2997			send_done_count = be32toh(stats->send_done_count);
2998		}
2999		if (sc->legacy_irq && mxge_deassert_wait)
3000			wmb();
3001	} while (*((volatile uint8_t *) &stats->valid));
3002
3003	/* fw link & error stats meaningful only on the first slice */
3004	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3005		if (sc->link_state != stats->link_up) {
3006			sc->link_state = stats->link_up;
3007			if (sc->link_state) {
3008				if_link_state_change(sc->ifp, LINK_STATE_UP);
3009				if (mxge_verbose)
3010					device_printf(sc->dev, "link up\n");
3011			} else {
3012				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3013				if (mxge_verbose)
3014					device_printf(sc->dev, "link down\n");
3015			}
3016			sc->need_media_probe = 1;
3017		}
3018		if (sc->rdma_tags_available !=
3019		    be32toh(stats->rdma_tags_available)) {
3020			sc->rdma_tags_available =
3021				be32toh(stats->rdma_tags_available);
3022			device_printf(sc->dev, "RDMA timed out! %d tags "
3023				      "left\n", sc->rdma_tags_available);
3024		}
3025
3026		if (stats->link_down) {
3027			sc->down_cnt += stats->link_down;
3028			sc->link_state = 0;
3029			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3030		}
3031	}
3032
3033	/* check to see if we have rx token to pass back */
3034	if (valid & 0x1)
3035	    *ss->irq_claim = be32toh(3);
3036	*(ss->irq_claim + 1) = be32toh(3);
3037}
3038
3039static void
3040mxge_init(void *arg)
3041{
3042	mxge_softc_t *sc = arg;
3043	if_t ifp = sc->ifp;
3044
3045	mtx_lock(&sc->driver_mtx);
3046	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)
3047		(void) mxge_open(sc);
3048	mtx_unlock(&sc->driver_mtx);
3049}
3050
3051static void
3052mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3053{
3054	int i;
3055
3056#if defined(INET) || defined(INET6)
3057	tcp_lro_free(&ss->lc);
3058#endif
3059	for (i = 0; i <= ss->rx_big.mask; i++) {
3060		if (ss->rx_big.info[i].m == NULL)
3061			continue;
3062		bus_dmamap_unload(ss->rx_big.dmat,
3063				  ss->rx_big.info[i].map);
3064		m_freem(ss->rx_big.info[i].m);
3065		ss->rx_big.info[i].m = NULL;
3066	}
3067
3068	for (i = 0; i <= ss->rx_small.mask; i++) {
3069		if (ss->rx_small.info[i].m == NULL)
3070			continue;
3071		bus_dmamap_unload(ss->rx_small.dmat,
3072				  ss->rx_small.info[i].map);
3073		m_freem(ss->rx_small.info[i].m);
3074		ss->rx_small.info[i].m = NULL;
3075	}
3076
3077	/* transmit ring used only on the first slice */
3078	if (ss->tx.info == NULL)
3079		return;
3080
3081	for (i = 0; i <= ss->tx.mask; i++) {
3082		ss->tx.info[i].flag = 0;
3083		if (ss->tx.info[i].m == NULL)
3084			continue;
3085		bus_dmamap_unload(ss->tx.dmat,
3086				  ss->tx.info[i].map);
3087		m_freem(ss->tx.info[i].m);
3088		ss->tx.info[i].m = NULL;
3089	}
3090}
3091
3092static void
3093mxge_free_mbufs(mxge_softc_t *sc)
3094{
3095	int slice;
3096
3097	for (slice = 0; slice < sc->num_slices; slice++)
3098		mxge_free_slice_mbufs(&sc->ss[slice]);
3099}
3100
3101static void
3102mxge_free_slice_rings(struct mxge_slice_state *ss)
3103{
3104	int i;
3105
3106	if (ss->rx_done.entry != NULL)
3107		mxge_dma_free(&ss->rx_done.dma);
3108	ss->rx_done.entry = NULL;
3109
3110	if (ss->tx.req_bytes != NULL)
3111		free(ss->tx.req_bytes, M_DEVBUF);
3112	ss->tx.req_bytes = NULL;
3113
3114	if (ss->tx.seg_list != NULL)
3115		free(ss->tx.seg_list, M_DEVBUF);
3116	ss->tx.seg_list = NULL;
3117
3118	if (ss->rx_small.shadow != NULL)
3119		free(ss->rx_small.shadow, M_DEVBUF);
3120	ss->rx_small.shadow = NULL;
3121
3122	if (ss->rx_big.shadow != NULL)
3123		free(ss->rx_big.shadow, M_DEVBUF);
3124	ss->rx_big.shadow = NULL;
3125
3126	if (ss->tx.info != NULL) {
3127		if (ss->tx.dmat != NULL) {
3128			for (i = 0; i <= ss->tx.mask; i++) {
3129				bus_dmamap_destroy(ss->tx.dmat,
3130						   ss->tx.info[i].map);
3131			}
3132			bus_dma_tag_destroy(ss->tx.dmat);
3133		}
3134		free(ss->tx.info, M_DEVBUF);
3135	}
3136	ss->tx.info = NULL;
3137
3138	if (ss->rx_small.info != NULL) {
3139		if (ss->rx_small.dmat != NULL) {
3140			for (i = 0; i <= ss->rx_small.mask; i++) {
3141				bus_dmamap_destroy(ss->rx_small.dmat,
3142						   ss->rx_small.info[i].map);
3143			}
3144			bus_dmamap_destroy(ss->rx_small.dmat,
3145					   ss->rx_small.extra_map);
3146			bus_dma_tag_destroy(ss->rx_small.dmat);
3147		}
3148		free(ss->rx_small.info, M_DEVBUF);
3149	}
3150	ss->rx_small.info = NULL;
3151
3152	if (ss->rx_big.info != NULL) {
3153		if (ss->rx_big.dmat != NULL) {
3154			for (i = 0; i <= ss->rx_big.mask; i++) {
3155				bus_dmamap_destroy(ss->rx_big.dmat,
3156						   ss->rx_big.info[i].map);
3157			}
3158			bus_dmamap_destroy(ss->rx_big.dmat,
3159					   ss->rx_big.extra_map);
3160			bus_dma_tag_destroy(ss->rx_big.dmat);
3161		}
3162		free(ss->rx_big.info, M_DEVBUF);
3163	}
3164	ss->rx_big.info = NULL;
3165}
3166
3167static void
3168mxge_free_rings(mxge_softc_t *sc)
3169{
3170	int slice;
3171
3172	for (slice = 0; slice < sc->num_slices; slice++)
3173		mxge_free_slice_rings(&sc->ss[slice]);
3174}
3175
3176static int
3177mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3178		       int tx_ring_entries)
3179{
3180	mxge_softc_t *sc = ss->sc;
3181	size_t bytes;
3182	int err, i;
3183
3184	/* allocate per-slice receive resources */
3185
3186	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3187	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3188
3189	/* allocate the rx shadow rings */
3190	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3191	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3192
3193	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3194	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3195
3196	/* allocate the rx host info rings */
3197	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3198	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3199
3200	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3201	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3202
3203	/* allocate the rx busdma resources */
3204	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3205				 1,			/* alignment */
3206				 4096,			/* boundary */
3207				 BUS_SPACE_MAXADDR,	/* low */
3208				 BUS_SPACE_MAXADDR,	/* high */
3209				 NULL, NULL,		/* filter */
3210				 MHLEN,			/* maxsize */
3211				 1,			/* num segs */
3212				 MHLEN,			/* maxsegsize */
3213				 BUS_DMA_ALLOCNOW,	/* flags */
3214				 NULL, NULL,		/* lock */
3215				 &ss->rx_small.dmat);	/* tag */
3216	if (err != 0) {
3217		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3218			      err);
3219		return err;
3220	}
3221
3222	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3223				 1,			/* alignment */
3224				 0,			/* boundary */
3225				 BUS_SPACE_MAXADDR,	/* low */
3226				 BUS_SPACE_MAXADDR,	/* high */
3227				 NULL, NULL,		/* filter */
3228				 3*4096,		/* maxsize */
3229				 1,			/* num segs */
3230				 MJUM9BYTES,		/* maxsegsize*/
3231				 BUS_DMA_ALLOCNOW,	/* flags */
3232				 NULL, NULL,		/* lock */
3233				 &ss->rx_big.dmat);	/* tag */
3234	if (err != 0) {
3235		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3236			      err);
3237		return err;
3238	}
3239	for (i = 0; i <= ss->rx_small.mask; i++) {
3240		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3241					&ss->rx_small.info[i].map);
3242		if (err != 0) {
3243			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3244				      err);
3245			return err;
3246		}
3247	}
3248	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3249				&ss->rx_small.extra_map);
3250	if (err != 0) {
3251		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3252			      err);
3253		return err;
3254	}
3255
3256	for (i = 0; i <= ss->rx_big.mask; i++) {
3257		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3258					&ss->rx_big.info[i].map);
3259		if (err != 0) {
3260			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3261				      err);
3262			return err;
3263		}
3264	}
3265	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3266				&ss->rx_big.extra_map);
3267	if (err != 0) {
3268		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3269			      err);
3270		return err;
3271	}
3272
3273	/* now allocate TX resources */
3274
3275	ss->tx.mask = tx_ring_entries - 1;
3276	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3277
3278	/* allocate the tx request copy block */
3279	bytes = 8 +
3280		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3281	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3282	/* ensure req_list entries are aligned to 8 bytes */
3283	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3284		((uintptr_t)(ss->tx.req_bytes + 7) & ~7UL);
3285
3286	/* allocate the tx busdma segment list */
3287	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3288	ss->tx.seg_list = (bus_dma_segment_t *)
3289		malloc(bytes, M_DEVBUF, M_WAITOK);
3290
3291	/* allocate the tx host info ring */
3292	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3293	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3294
3295	/* allocate the tx busdma resources */
3296	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3297				 1,			/* alignment */
3298				 sc->tx_boundary,	/* boundary */
3299				 BUS_SPACE_MAXADDR,	/* low */
3300				 BUS_SPACE_MAXADDR,	/* high */
3301				 NULL, NULL,		/* filter */
3302				 65536 + 256,		/* maxsize */
3303				 ss->tx.max_desc - 2,	/* num segs */
3304				 sc->tx_boundary,	/* maxsegsz */
3305				 BUS_DMA_ALLOCNOW,	/* flags */
3306				 NULL, NULL,		/* lock */
3307				 &ss->tx.dmat);		/* tag */
3308
3309	if (err != 0) {
3310		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3311			      err);
3312		return err;
3313	}
3314
3315	/* now use these tags to setup dmamaps for each slot
3316	   in the ring */
3317	for (i = 0; i <= ss->tx.mask; i++) {
3318		err = bus_dmamap_create(ss->tx.dmat, 0,
3319					&ss->tx.info[i].map);
3320		if (err != 0) {
3321			device_printf(sc->dev, "Err %d  tx dmamap\n",
3322				      err);
3323			return err;
3324		}
3325	}
3326	return 0;
3327
3328}
3329
3330static int
3331mxge_alloc_rings(mxge_softc_t *sc)
3332{
3333	mxge_cmd_t cmd;
3334	int tx_ring_size;
3335	int tx_ring_entries, rx_ring_entries;
3336	int err, slice;
3337
3338	/* get ring sizes */
3339	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3340	tx_ring_size = cmd.data0;
3341	if (err != 0) {
3342		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3343		goto abort;
3344	}
3345
3346	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3347	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3348	if_setsendqlen(sc->ifp, tx_ring_entries - 1);
3349	if_setsendqready(sc->ifp);
3350
3351	for (slice = 0; slice < sc->num_slices; slice++) {
3352		err = mxge_alloc_slice_rings(&sc->ss[slice],
3353					     rx_ring_entries,
3354					     tx_ring_entries);
3355		if (err != 0)
3356			goto abort;
3357	}
3358	return 0;
3359
3360abort:
3361	mxge_free_rings(sc);
3362	return err;
3363
3364}
3365
3366static void
3367mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3368{
3369	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3370
3371	if (bufsize < MCLBYTES) {
3372		/* easy, everything fits in a single buffer */
3373		*big_buf_size = MCLBYTES;
3374		*cl_size = MCLBYTES;
3375		*nbufs = 1;
3376		return;
3377	}
3378
3379	if (bufsize < MJUMPAGESIZE) {
3380		/* still easy, everything still fits in a single buffer */
3381		*big_buf_size = MJUMPAGESIZE;
3382		*cl_size = MJUMPAGESIZE;
3383		*nbufs = 1;
3384		return;
3385	}
3386	*cl_size = MJUM9BYTES;
3387	*big_buf_size = MJUM9BYTES;
3388	*nbufs = 1;
3389}
3390
3391static int
3392mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3393{
3394	mxge_softc_t *sc;
3395	mxge_cmd_t cmd;
3396	bus_dmamap_t map;
3397	int err, i, slice;
3398
3399	sc = ss->sc;
3400	slice = ss - sc->ss;
3401
3402#if defined(INET) || defined(INET6)
3403	(void)tcp_lro_init(&ss->lc);
3404#endif
3405	ss->lc.ifp = sc->ifp;
3406
3407	/* get the lanai pointers to the send and receive rings */
3408
3409	err = 0;
3410
3411	cmd.data0 = slice;
3412	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3413	ss->tx.lanai =
3414		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3415	ss->tx.send_go = (volatile uint32_t *)
3416		(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3417	ss->tx.send_stop = (volatile uint32_t *)
3418	(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3419
3420	cmd.data0 = slice;
3421	err |= mxge_send_cmd(sc,
3422			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3423	ss->rx_small.lanai =
3424		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3425	cmd.data0 = slice;
3426	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3427	ss->rx_big.lanai =
3428		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3429
3430	if (err != 0) {
3431		device_printf(sc->dev,
3432			      "failed to get ring sizes or locations\n");
3433		return EIO;
3434	}
3435
3436	/* stock receive rings */
3437	for (i = 0; i <= ss->rx_small.mask; i++) {
3438		map = ss->rx_small.info[i].map;
3439		err = mxge_get_buf_small(ss, map, i);
3440		if (err) {
3441			device_printf(sc->dev, "alloced %d/%d smalls\n",
3442				      i, ss->rx_small.mask + 1);
3443			return ENOMEM;
3444		}
3445	}
3446	for (i = 0; i <= ss->rx_big.mask; i++) {
3447		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3448		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3449	}
3450	ss->rx_big.nbufs = nbufs;
3451	ss->rx_big.cl_size = cl_size;
3452	ss->rx_big.mlen = if_getmtu(ss->sc->ifp) + ETHER_HDR_LEN +
3453		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3454	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3455		map = ss->rx_big.info[i].map;
3456		err = mxge_get_buf_big(ss, map, i);
3457		if (err) {
3458			device_printf(sc->dev, "alloced %d/%d bigs\n",
3459				      i, ss->rx_big.mask + 1);
3460			return ENOMEM;
3461		}
3462	}
3463	return 0;
3464}
3465
3466static int
3467mxge_open(mxge_softc_t *sc)
3468{
3469	mxge_cmd_t cmd;
3470	int err, big_bytes, nbufs, slice, cl_size, i;
3471	bus_addr_t bus;
3472	volatile uint8_t *itable;
3473	struct mxge_slice_state *ss;
3474
3475	/* Copy the MAC address in case it was overridden */
3476	bcopy(if_getlladdr(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3477
3478	err = mxge_reset(sc, 1);
3479	if (err != 0) {
3480		device_printf(sc->dev, "failed to reset\n");
3481		return EIO;
3482	}
3483
3484	if (sc->num_slices > 1) {
3485		/* setup the indirection table */
3486		cmd.data0 = sc->num_slices;
3487		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3488				    &cmd);
3489
3490		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3491				     &cmd);
3492		if (err != 0) {
3493			device_printf(sc->dev,
3494				      "failed to setup rss tables\n");
3495			return err;
3496		}
3497
3498		/* just enable an identity mapping */
3499		itable = sc->sram + cmd.data0;
3500		for (i = 0; i < sc->num_slices; i++)
3501			itable[i] = (uint8_t)i;
3502
3503		cmd.data0 = 1;
3504		cmd.data1 = mxge_rss_hash_type;
3505		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3506		if (err != 0) {
3507			device_printf(sc->dev, "failed to enable slices\n");
3508			return err;
3509		}
3510	}
3511
3512	mxge_choose_params(if_getmtu(sc->ifp), &big_bytes, &cl_size, &nbufs);
3513
3514	cmd.data0 = nbufs;
3515	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3516			    &cmd);
3517	/* error is only meaningful if we're trying to set
3518	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3519	if (err && nbufs > 1) {
3520		device_printf(sc->dev,
3521			      "Failed to set alway-use-n to %d\n",
3522			      nbufs);
3523		return EIO;
3524	}
3525	/* Give the firmware the mtu and the big and small buffer
3526	   sizes.  The firmware wants the big buf size to be a power
3527	   of two. Luckily, FreeBSD's clusters are powers of two */
3528	cmd.data0 = if_getmtu(sc->ifp) + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3529	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3530	cmd.data0 = MHLEN - MXGEFW_PAD;
3531	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3532			     &cmd);
3533	cmd.data0 = big_bytes;
3534	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3535
3536	if (err != 0) {
3537		device_printf(sc->dev, "failed to setup params\n");
3538		goto abort;
3539	}
3540
3541	/* Now give him the pointer to the stats block */
3542	for (slice = 0; slice < sc->num_slices; slice++) {
3543		ss = &sc->ss[slice];
3544		cmd.data0 =
3545			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3546		cmd.data1 =
3547			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3548		cmd.data2 = sizeof(struct mcp_irq_data);
3549		cmd.data2 |= (slice << 16);
3550		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3551	}
3552
3553	if (err != 0) {
3554		bus = sc->ss->fw_stats_dma.bus_addr;
3555		bus += offsetof(struct mcp_irq_data, send_done_count);
3556		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3557		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3558		err = mxge_send_cmd(sc,
3559				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3560				    &cmd);
3561		/* Firmware cannot support multicast without STATS_DMA_V2 */
3562		sc->fw_multicast_support = 0;
3563	} else {
3564		sc->fw_multicast_support = 1;
3565	}
3566
3567	if (err != 0) {
3568		device_printf(sc->dev, "failed to setup params\n");
3569		goto abort;
3570	}
3571
3572	for (slice = 0; slice < sc->num_slices; slice++) {
3573		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3574		if (err != 0) {
3575			device_printf(sc->dev, "couldn't open slice %d\n",
3576				      slice);
3577			goto abort;
3578		}
3579	}
3580
3581	/* Finally, start the firmware running */
3582	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3583	if (err) {
3584		device_printf(sc->dev, "Couldn't bring up link\n");
3585		goto abort;
3586	}
3587	for (slice = 0; slice < sc->num_slices; slice++) {
3588		ss = &sc->ss[slice];
3589		ss->if_drv_flags |= IFF_DRV_RUNNING;
3590		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3591	}
3592	if_setdrvflagbits(sc->ifp, IFF_DRV_RUNNING, 0);
3593	if_setdrvflagbits(sc->ifp, 0, IFF_DRV_OACTIVE);
3594
3595	return 0;
3596
3597abort:
3598	mxge_free_mbufs(sc);
3599
3600	return err;
3601}
3602
3603static int
3604mxge_close(mxge_softc_t *sc, int down)
3605{
3606	mxge_cmd_t cmd;
3607	int err, old_down_cnt;
3608	struct mxge_slice_state *ss;
3609	int slice;
3610
3611	for (slice = 0; slice < sc->num_slices; slice++) {
3612		ss = &sc->ss[slice];
3613		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3614	}
3615	if_setdrvflagbits(sc->ifp, 0, IFF_DRV_RUNNING);
3616	if (!down) {
3617		old_down_cnt = sc->down_cnt;
3618		wmb();
3619		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3620		if (err) {
3621			device_printf(sc->dev,
3622				      "Couldn't bring down link\n");
3623		}
3624		if (old_down_cnt == sc->down_cnt) {
3625			/* wait for down irq */
3626			DELAY(10 * sc->intr_coal_delay);
3627		}
3628		wmb();
3629		if (old_down_cnt == sc->down_cnt) {
3630			device_printf(sc->dev, "never got down irq\n");
3631		}
3632	}
3633	mxge_free_mbufs(sc);
3634
3635	return 0;
3636}
3637
3638static void
3639mxge_setup_cfg_space(mxge_softc_t *sc)
3640{
3641	device_t dev = sc->dev;
3642	int reg;
3643	uint16_t lnk, pectl;
3644
3645	/* find the PCIe link width and set max read request to 4KB*/
3646	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3647		lnk = pci_read_config(dev, reg + 0x12, 2);
3648		sc->link_width = (lnk >> 4) & 0x3f;
3649
3650		if (sc->pectl == 0) {
3651			pectl = pci_read_config(dev, reg + 0x8, 2);
3652			pectl = (pectl & ~0x7000) | (5 << 12);
3653			pci_write_config(dev, reg + 0x8, pectl, 2);
3654			sc->pectl = pectl;
3655		} else {
3656			/* restore saved pectl after watchdog reset */
3657			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3658		}
3659	}
3660
3661	/* Enable DMA and Memory space access */
3662	pci_enable_busmaster(dev);
3663}
3664
3665static uint32_t
3666mxge_read_reboot(mxge_softc_t *sc)
3667{
3668	device_t dev = sc->dev;
3669	uint32_t vs;
3670
3671	/* find the vendor specific offset */
3672	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3673		device_printf(sc->dev,
3674			      "could not find vendor specific offset\n");
3675		return (uint32_t)-1;
3676	}
3677	/* enable read32 mode */
3678	pci_write_config(dev, vs + 0x10, 0x3, 1);
3679	/* tell NIC which register to read */
3680	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3681	return (pci_read_config(dev, vs + 0x14, 4));
3682}
3683
3684static void
3685mxge_watchdog_reset(mxge_softc_t *sc)
3686{
3687	struct pci_devinfo *dinfo;
3688	struct mxge_slice_state *ss;
3689	int err, running, s, num_tx_slices = 1;
3690	uint32_t reboot;
3691	uint16_t cmd;
3692
3693	err = ENXIO;
3694
3695	device_printf(sc->dev, "Watchdog reset!\n");
3696
3697	/*
3698	 * check to see if the NIC rebooted.  If it did, then all of
3699	 * PCI config space has been reset, and things like the
3700	 * busmaster bit will be zero.  If this is the case, then we
3701	 * must restore PCI config space before the NIC can be used
3702	 * again
3703	 */
3704	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3705	if (cmd == 0xffff) {
3706		/*
3707		 * maybe the watchdog caught the NIC rebooting; wait
3708		 * up to 100ms for it to finish.  If it does not come
3709		 * back, then give up
3710		 */
3711		DELAY(1000*100);
3712		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3713		if (cmd == 0xffff) {
3714			device_printf(sc->dev, "NIC disappeared!\n");
3715		}
3716	}
3717	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3718		/* print the reboot status */
3719		reboot = mxge_read_reboot(sc);
3720		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3721			      reboot);
3722		running = if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING;
3723		if (running) {
3724			/*
3725			 * quiesce NIC so that TX routines will not try to
3726			 * xmit after restoration of BAR
3727			 */
3728
3729			/* Mark the link as down */
3730			if (sc->link_state) {
3731				sc->link_state = 0;
3732				if_link_state_change(sc->ifp,
3733						     LINK_STATE_DOWN);
3734			}
3735
3736			num_tx_slices = sc->num_slices;
3737
3738			/* grab all TX locks to ensure no tx  */
3739			for (s = 0; s < num_tx_slices; s++) {
3740				ss = &sc->ss[s];
3741				mtx_lock(&ss->tx.mtx);
3742			}
3743			mxge_close(sc, 1);
3744		}
3745		/* restore PCI configuration space */
3746		dinfo = device_get_ivars(sc->dev);
3747		pci_cfg_restore(sc->dev, dinfo);
3748
3749		/* and redo any changes we made to our config space */
3750		mxge_setup_cfg_space(sc);
3751
3752		/* reload f/w */
3753		err = mxge_load_firmware(sc, 0);
3754		if (err) {
3755			device_printf(sc->dev,
3756				      "Unable to re-load f/w\n");
3757		}
3758		if (running) {
3759			if (!err)
3760				err = mxge_open(sc);
3761			/* release all TX locks */
3762			for (s = 0; s < num_tx_slices; s++) {
3763				ss = &sc->ss[s];
3764				mxge_start_locked(ss);
3765				mtx_unlock(&ss->tx.mtx);
3766			}
3767		}
3768		sc->watchdog_resets++;
3769	} else {
3770		device_printf(sc->dev,
3771			      "NIC did not reboot, not resetting\n");
3772		err = 0;
3773	}
3774	if (err) {
3775		device_printf(sc->dev, "watchdog reset failed\n");
3776	} else {
3777		if (sc->dying == 2)
3778			sc->dying = 0;
3779		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3780	}
3781}
3782
3783static void
3784mxge_watchdog_task(void *arg, int pending)
3785{
3786	mxge_softc_t *sc = arg;
3787
3788	mtx_lock(&sc->driver_mtx);
3789	mxge_watchdog_reset(sc);
3790	mtx_unlock(&sc->driver_mtx);
3791}
3792
3793static void
3794mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3795{
3796	tx = &sc->ss[slice].tx;
3797	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3798	device_printf(sc->dev,
3799		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3800		      tx->req, tx->done, tx->queue_active);
3801	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3802			      tx->activate, tx->deactivate);
3803	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3804		      tx->pkt_done,
3805		      be32toh(sc->ss->fw_stats->send_done_count));
3806}
3807
3808static int
3809mxge_watchdog(mxge_softc_t *sc)
3810{
3811	mxge_tx_ring_t *tx;
3812	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3813	int i, err = 0;
3814
3815	/* see if we have outstanding transmits, which
3816	   have been pending for more than mxge_ticks */
3817	for (i = 0; (i < sc->num_slices) && (err == 0); i++) {
3818		tx = &sc->ss[i].tx;
3819		if (tx->req != tx->done &&
3820		    tx->watchdog_req != tx->watchdog_done &&
3821		    tx->done == tx->watchdog_done) {
3822			/* check for pause blocking before resetting */
3823			if (tx->watchdog_rx_pause == rx_pause) {
3824				mxge_warn_stuck(sc, tx, i);
3825				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3826				return (ENXIO);
3827			}
3828			else
3829				device_printf(sc->dev, "Flow control blocking "
3830					      "xmits, check link partner\n");
3831		}
3832
3833		tx->watchdog_req = tx->req;
3834		tx->watchdog_done = tx->done;
3835		tx->watchdog_rx_pause = rx_pause;
3836	}
3837
3838	if (sc->need_media_probe)
3839		mxge_media_probe(sc);
3840	return (err);
3841}
3842
3843static uint64_t
3844mxge_get_counter(if_t ifp, ift_counter cnt)
3845{
3846	struct mxge_softc *sc;
3847	uint64_t rv;
3848
3849	sc = if_getsoftc(ifp);
3850	rv = 0;
3851
3852	switch (cnt) {
3853	case IFCOUNTER_IPACKETS:
3854		for (int s = 0; s < sc->num_slices; s++)
3855			rv += sc->ss[s].ipackets;
3856		return (rv);
3857	case IFCOUNTER_OPACKETS:
3858		for (int s = 0; s < sc->num_slices; s++)
3859			rv += sc->ss[s].opackets;
3860		return (rv);
3861	case IFCOUNTER_OERRORS:
3862		for (int s = 0; s < sc->num_slices; s++)
3863			rv += sc->ss[s].oerrors;
3864		return (rv);
3865	case IFCOUNTER_OBYTES:
3866		for (int s = 0; s < sc->num_slices; s++)
3867			rv += sc->ss[s].obytes;
3868		return (rv);
3869	case IFCOUNTER_OMCASTS:
3870		for (int s = 0; s < sc->num_slices; s++)
3871			rv += sc->ss[s].omcasts;
3872		return (rv);
3873	case IFCOUNTER_OQDROPS:
3874		for (int s = 0; s < sc->num_slices; s++)
3875			rv += sc->ss[s].tx.br->br_drops;
3876		return (rv);
3877	default:
3878		return (if_get_counter_default(ifp, cnt));
3879	}
3880}
3881
3882static void
3883mxge_tick(void *arg)
3884{
3885	mxge_softc_t *sc = arg;
3886	u_long pkts = 0;
3887	int err = 0;
3888	int running, ticks;
3889	uint16_t cmd;
3890
3891	ticks = mxge_ticks;
3892	running = if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING;
3893	if (running) {
3894		if (!sc->watchdog_countdown) {
3895			err = mxge_watchdog(sc);
3896			sc->watchdog_countdown = 4;
3897		}
3898		sc->watchdog_countdown--;
3899	}
3900	if (pkts == 0) {
3901		/* ensure NIC did not suffer h/w fault while idle */
3902		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3903		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3904			sc->dying = 2;
3905			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3906			err = ENXIO;
3907		}
3908		/* look less often if NIC is idle */
3909		ticks *= 4;
3910	}
3911
3912	if (err == 0)
3913		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3914
3915}
3916
3917static int
3918mxge_media_change(if_t ifp)
3919{
3920	return EINVAL;
3921}
3922
3923static int
3924mxge_change_mtu(mxge_softc_t *sc, int mtu)
3925{
3926	if_t ifp = sc->ifp;
3927	int real_mtu, old_mtu;
3928	int err = 0;
3929
3930	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3931	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3932		return EINVAL;
3933	mtx_lock(&sc->driver_mtx);
3934	old_mtu = if_getmtu(ifp);
3935	if_setmtu(ifp, mtu);
3936	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3937		mxge_close(sc, 0);
3938		err = mxge_open(sc);
3939		if (err != 0) {
3940			if_setmtu(ifp, old_mtu);
3941			mxge_close(sc, 0);
3942			(void) mxge_open(sc);
3943		}
3944	}
3945	mtx_unlock(&sc->driver_mtx);
3946	return err;
3947}
3948
3949static void
3950mxge_media_status(if_t ifp, struct ifmediareq *ifmr)
3951{
3952	mxge_softc_t *sc = if_getsoftc(ifp);
3953
3954	if (sc == NULL)
3955		return;
3956	ifmr->ifm_status = IFM_AVALID;
3957	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3958	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3959	ifmr->ifm_active |= sc->current_media;
3960}
3961
3962static int
3963mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
3964{
3965	mxge_cmd_t cmd;
3966	uint32_t i2c_args;
3967	int i, ms, err;
3968
3969	if (i2c->dev_addr != 0xA0 &&
3970	    i2c->dev_addr != 0xA2)
3971		return (EINVAL);
3972	if (i2c->len > sizeof(i2c->data))
3973		return (EINVAL);
3974
3975	for (i = 0; i < i2c->len; i++) {
3976		i2c_args = i2c->dev_addr << 0x8;
3977		i2c_args |= i2c->offset + i;
3978		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3979		cmd.data1 = i2c_args;
3980		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3981
3982		if (err != MXGEFW_CMD_OK)
3983			return (EIO);
3984		/* now we wait for the data to be cached */
3985		cmd.data0 = i2c_args & 0xff;
3986		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3987		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3988			cmd.data0 = i2c_args & 0xff;
3989			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3990			if (err == EBUSY)
3991				DELAY(1000);
3992		}
3993		if (err != MXGEFW_CMD_OK)
3994			return (EIO);
3995		i2c->data[i] = cmd.data0;
3996	}
3997	return (0);
3998}
3999
4000static int
4001mxge_ioctl(if_t ifp, u_long command, caddr_t data)
4002{
4003	mxge_softc_t *sc = if_getsoftc(ifp);
4004	struct ifreq *ifr = (struct ifreq *)data;
4005	struct ifi2creq i2c;
4006	int err, mask;
4007
4008	err = 0;
4009	switch (command) {
4010	case SIOCSIFMTU:
4011		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4012		break;
4013
4014	case SIOCSIFFLAGS:
4015		mtx_lock(&sc->driver_mtx);
4016		if (sc->dying) {
4017			mtx_unlock(&sc->driver_mtx);
4018			return EINVAL;
4019		}
4020		if (if_getflags(ifp) & IFF_UP) {
4021			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)) {
4022				err = mxge_open(sc);
4023			} else {
4024				/* take care of promis can allmulti
4025				   flag chages */
4026				mxge_change_promisc(sc,
4027						    if_getflags(ifp) & IFF_PROMISC);
4028				mxge_set_multicast_list(sc);
4029			}
4030		} else {
4031			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4032				mxge_close(sc, 0);
4033			}
4034		}
4035		mtx_unlock(&sc->driver_mtx);
4036		break;
4037
4038	case SIOCADDMULTI:
4039	case SIOCDELMULTI:
4040		mtx_lock(&sc->driver_mtx);
4041		if (sc->dying) {
4042			mtx_unlock(&sc->driver_mtx);
4043			return (EINVAL);
4044		}
4045		mxge_set_multicast_list(sc);
4046		mtx_unlock(&sc->driver_mtx);
4047		break;
4048
4049	case SIOCSIFCAP:
4050		mtx_lock(&sc->driver_mtx);
4051		mask = ifr->ifr_reqcap ^ if_getcapenable(ifp);
4052		if (mask & IFCAP_TXCSUM) {
4053			if (IFCAP_TXCSUM & if_getcapenable(ifp)) {
4054				mask &= ~IFCAP_TSO4;
4055				if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM|IFCAP_TSO4));
4056				if_sethwassistbits(ifp, 0, (CSUM_TCP | CSUM_UDP));
4057			} else {
4058				if_setcapenablebit(ifp, IFCAP_TXCSUM, 0);
4059				if_sethwassistbits(ifp, (CSUM_TCP | CSUM_UDP), 0);
4060			}
4061		}
4062		if (mask & IFCAP_RXCSUM) {
4063			if (IFCAP_RXCSUM & if_getcapenable(ifp)) {
4064				if_setcapenablebit(ifp, 0, IFCAP_RXCSUM);
4065			} else {
4066				if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);
4067			}
4068		}
4069		if (mask & IFCAP_TSO4) {
4070			if (IFCAP_TSO4 & if_getcapenable(ifp)) {
4071				if_setcapenablebit(ifp, 0, IFCAP_TSO4);
4072			} else if (IFCAP_TXCSUM & if_getcapenable(ifp)) {
4073				if_setcapenablebit(ifp, IFCAP_TSO4, 0);
4074				if_sethwassistbits(ifp, CSUM_TSO, 0);
4075			} else {
4076				printf("mxge requires tx checksum offload"
4077				       " be enabled to use TSO\n");
4078				err = EINVAL;
4079			}
4080		}
4081#if IFCAP_TSO6
4082		if (mask & IFCAP_TXCSUM_IPV6) {
4083			if (IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp)) {
4084				mask &= ~IFCAP_TSO6;
4085				if_setcapenablebit(ifp, 0,
4086				    IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
4087				if_sethwassistbits(ifp, 0,
4088				    CSUM_TCP_IPV6 | CSUM_UDP);
4089			} else {
4090				if_setcapenablebit(ifp, IFCAP_TXCSUM_IPV6, 0);
4091				if_sethwassistbits(ifp,
4092				    CSUM_TCP_IPV6 | CSUM_UDP_IPV6, 0);
4093			}
4094		}
4095		if (mask & IFCAP_RXCSUM_IPV6) {
4096			if (IFCAP_RXCSUM_IPV6 & if_getcapenable(ifp)) {
4097				if_setcapenablebit(ifp, 0, IFCAP_RXCSUM_IPV6);
4098			} else {
4099				if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0);
4100			}
4101		}
4102		if (mask & IFCAP_TSO6) {
4103			if (IFCAP_TSO6 & if_getcapenable(ifp)) {
4104				if_setcapenablebit(ifp, 0, IFCAP_TSO6);
4105			} else if (IFCAP_TXCSUM_IPV6 & if_getcapenable(ifp)) {
4106				if_setcapenablebit(ifp, IFCAP_TSO6, 0);
4107				if_sethwassistbits(ifp, CSUM_TSO, 0);
4108			} else {
4109				printf("mxge requires tx checksum offload"
4110				       " be enabled to use TSO\n");
4111				err = EINVAL;
4112			}
4113		}
4114#endif /*IFCAP_TSO6 */
4115
4116		if (mask & IFCAP_LRO)
4117			if_togglecapenable(ifp, IFCAP_LRO);
4118		if (mask & IFCAP_VLAN_HWTAGGING)
4119			if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING);
4120		if (mask & IFCAP_VLAN_HWTSO)
4121			if_togglecapenable(ifp, IFCAP_VLAN_HWTSO);
4122
4123		if (!(if_getcapabilities(ifp) & IFCAP_VLAN_HWTSO) ||
4124		    !(if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING))
4125			if_setcapenablebit(ifp, 0, IFCAP_VLAN_HWTSO);
4126
4127		mtx_unlock(&sc->driver_mtx);
4128		VLAN_CAPABILITIES(ifp);
4129
4130		break;
4131
4132	case SIOCGIFMEDIA:
4133		mtx_lock(&sc->driver_mtx);
4134		if (sc->dying) {
4135			mtx_unlock(&sc->driver_mtx);
4136			return (EINVAL);
4137		}
4138		mxge_media_probe(sc);
4139		mtx_unlock(&sc->driver_mtx);
4140		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4141				    &sc->media, command);
4142		break;
4143
4144	case SIOCGI2C:
4145		if (sc->connector != MXGE_XFP &&
4146		    sc->connector != MXGE_SFP) {
4147			err = ENXIO;
4148			break;
4149		}
4150		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4151		if (err != 0)
4152			break;
4153		mtx_lock(&sc->driver_mtx);
4154		if (sc->dying) {
4155			mtx_unlock(&sc->driver_mtx);
4156			return (EINVAL);
4157		}
4158		err = mxge_fetch_i2c(sc, &i2c);
4159		mtx_unlock(&sc->driver_mtx);
4160		if (err == 0)
4161			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4162			    sizeof(i2c));
4163		break;
4164	default:
4165		err = ether_ioctl(ifp, command, data);
4166		break;
4167	}
4168	return err;
4169}
4170
4171static void
4172mxge_fetch_tunables(mxge_softc_t *sc)
4173{
4174
4175	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4176	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4177			  &mxge_flow_control);
4178	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4179			  &mxge_intr_coal_delay);
4180	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4181			  &mxge_nvidia_ecrc_enable);
4182	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4183			  &mxge_force_firmware);
4184	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4185			  &mxge_deassert_wait);
4186	TUNABLE_INT_FETCH("hw.mxge.verbose",
4187			  &mxge_verbose);
4188	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4189	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4190	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4191	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4192	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4193	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4194
4195	if (bootverbose)
4196		mxge_verbose = 1;
4197	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4198		mxge_intr_coal_delay = 30;
4199	if (mxge_ticks == 0)
4200		mxge_ticks = hz / 2;
4201	sc->pause = mxge_flow_control;
4202	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4203	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4204		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4205	}
4206	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4207	    mxge_initial_mtu < ETHER_MIN_LEN)
4208		mxge_initial_mtu = ETHERMTU_JUMBO;
4209
4210	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4211		mxge_throttle = MXGE_MAX_THROTTLE;
4212	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4213		mxge_throttle = MXGE_MIN_THROTTLE;
4214	sc->throttle = mxge_throttle;
4215}
4216
4217static void
4218mxge_free_slices(mxge_softc_t *sc)
4219{
4220	struct mxge_slice_state *ss;
4221	int i;
4222
4223	if (sc->ss == NULL)
4224		return;
4225
4226	for (i = 0; i < sc->num_slices; i++) {
4227		ss = &sc->ss[i];
4228		if (ss->fw_stats != NULL) {
4229			mxge_dma_free(&ss->fw_stats_dma);
4230			ss->fw_stats = NULL;
4231			if (ss->tx.br != NULL) {
4232				drbr_free(ss->tx.br, M_DEVBUF);
4233				ss->tx.br = NULL;
4234			}
4235			mtx_destroy(&ss->tx.mtx);
4236		}
4237		if (ss->rx_done.entry != NULL) {
4238			mxge_dma_free(&ss->rx_done.dma);
4239			ss->rx_done.entry = NULL;
4240		}
4241	}
4242	free(sc->ss, M_DEVBUF);
4243	sc->ss = NULL;
4244}
4245
4246static int
4247mxge_alloc_slices(mxge_softc_t *sc)
4248{
4249	mxge_cmd_t cmd;
4250	struct mxge_slice_state *ss;
4251	size_t bytes;
4252	int err, i, max_intr_slots;
4253
4254	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4255	if (err != 0) {
4256		device_printf(sc->dev, "Cannot determine rx ring size\n");
4257		return err;
4258	}
4259	sc->rx_ring_size = cmd.data0;
4260	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4261
4262	bytes = sizeof (*sc->ss) * sc->num_slices;
4263	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4264	if (sc->ss == NULL)
4265		return (ENOMEM);
4266	for (i = 0; i < sc->num_slices; i++) {
4267		ss = &sc->ss[i];
4268
4269		ss->sc = sc;
4270
4271		/* allocate per-slice rx interrupt queues */
4272
4273		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4274		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4275		if (err != 0)
4276			goto abort;
4277		ss->rx_done.entry = ss->rx_done.dma.addr;
4278		bzero(ss->rx_done.entry, bytes);
4279
4280		/*
4281		 * allocate the per-slice firmware stats; stats
4282		 * (including tx) are used used only on the first
4283		 * slice for now
4284		 */
4285
4286		bytes = sizeof (*ss->fw_stats);
4287		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4288				     sizeof (*ss->fw_stats), 64);
4289		if (err != 0)
4290			goto abort;
4291		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4292		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4293			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4294		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4295		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4296					   &ss->tx.mtx);
4297	}
4298
4299	return (0);
4300
4301abort:
4302	mxge_free_slices(sc);
4303	return (ENOMEM);
4304}
4305
4306static void
4307mxge_slice_probe(mxge_softc_t *sc)
4308{
4309	mxge_cmd_t cmd;
4310	char *old_fw;
4311	int msix_cnt, status, max_intr_slots;
4312
4313	sc->num_slices = 1;
4314	/*
4315	 *  don't enable multiple slices if they are not enabled,
4316	 *  or if this is not an SMP system
4317	 */
4318
4319	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4320		return;
4321
4322	/* see how many MSI-X interrupts are available */
4323	msix_cnt = pci_msix_count(sc->dev);
4324	if (msix_cnt < 2)
4325		return;
4326
4327	/* now load the slice aware firmware see what it supports */
4328	old_fw = sc->fw_name;
4329	if (old_fw == mxge_fw_aligned)
4330		sc->fw_name = mxge_fw_rss_aligned;
4331	else
4332		sc->fw_name = mxge_fw_rss_unaligned;
4333	status = mxge_load_firmware(sc, 0);
4334	if (status != 0) {
4335		device_printf(sc->dev, "Falling back to a single slice\n");
4336		return;
4337	}
4338
4339	/* try to send a reset command to the card to see if it
4340	   is alive */
4341	memset(&cmd, 0, sizeof (cmd));
4342	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4343	if (status != 0) {
4344		device_printf(sc->dev, "failed reset\n");
4345		goto abort_with_fw;
4346	}
4347
4348	/* get rx ring size */
4349	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4350	if (status != 0) {
4351		device_printf(sc->dev, "Cannot determine rx ring size\n");
4352		goto abort_with_fw;
4353	}
4354	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4355
4356	/* tell it the size of the interrupt queues */
4357	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4358	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4359	if (status != 0) {
4360		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4361		goto abort_with_fw;
4362	}
4363
4364	/* ask the maximum number of slices it supports */
4365	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4366	if (status != 0) {
4367		device_printf(sc->dev,
4368			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4369		goto abort_with_fw;
4370	}
4371	sc->num_slices = cmd.data0;
4372	if (sc->num_slices > msix_cnt)
4373		sc->num_slices = msix_cnt;
4374
4375	if (mxge_max_slices == -1) {
4376		/* cap to number of CPUs in system */
4377		if (sc->num_slices > mp_ncpus)
4378			sc->num_slices = mp_ncpus;
4379	} else {
4380		if (sc->num_slices > mxge_max_slices)
4381			sc->num_slices = mxge_max_slices;
4382	}
4383	/* make sure it is a power of two */
4384	while (sc->num_slices & (sc->num_slices - 1))
4385		sc->num_slices--;
4386
4387	if (mxge_verbose)
4388		device_printf(sc->dev, "using %d slices\n",
4389			      sc->num_slices);
4390
4391	return;
4392
4393abort_with_fw:
4394	sc->fw_name = old_fw;
4395	(void) mxge_load_firmware(sc, 0);
4396}
4397
4398static int
4399mxge_add_msix_irqs(mxge_softc_t *sc)
4400{
4401	size_t bytes;
4402	int count, err, i, rid;
4403
4404	rid = PCIR_BAR(2);
4405	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4406						    &rid, RF_ACTIVE);
4407
4408	if (sc->msix_table_res == NULL) {
4409		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4410		return ENXIO;
4411	}
4412
4413	count = sc->num_slices;
4414	err = pci_alloc_msix(sc->dev, &count);
4415	if (err != 0) {
4416		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4417			      "err = %d \n", sc->num_slices, err);
4418		goto abort_with_msix_table;
4419	}
4420	if (count < sc->num_slices) {
4421		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4422			      count, sc->num_slices);
4423		device_printf(sc->dev,
4424			      "Try setting hw.mxge.max_slices to %d\n",
4425			      count);
4426		err = ENOSPC;
4427		goto abort_with_msix;
4428	}
4429	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4430	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4431	if (sc->msix_irq_res == NULL) {
4432		err = ENOMEM;
4433		goto abort_with_msix;
4434	}
4435
4436	for (i = 0; i < sc->num_slices; i++) {
4437		rid = i + 1;
4438		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4439							  SYS_RES_IRQ,
4440							  &rid, RF_ACTIVE);
4441		if (sc->msix_irq_res[i] == NULL) {
4442			device_printf(sc->dev, "couldn't allocate IRQ res"
4443				      " for message %d\n", i);
4444			err = ENXIO;
4445			goto abort_with_res;
4446		}
4447	}
4448
4449	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4450	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4451
4452	for (i = 0; i < sc->num_slices; i++) {
4453		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4454				     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4455				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4456		if (err != 0) {
4457			device_printf(sc->dev, "couldn't setup intr for "
4458				      "message %d\n", i);
4459			goto abort_with_intr;
4460		}
4461		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4462				  sc->msix_ih[i], "s%d", i);
4463	}
4464
4465	if (mxge_verbose) {
4466		device_printf(sc->dev, "using %d msix IRQs:",
4467			      sc->num_slices);
4468		for (i = 0; i < sc->num_slices; i++)
4469			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4470		printf("\n");
4471	}
4472	return (0);
4473
4474abort_with_intr:
4475	for (i = 0; i < sc->num_slices; i++) {
4476		if (sc->msix_ih[i] != NULL) {
4477			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4478					  sc->msix_ih[i]);
4479			sc->msix_ih[i] = NULL;
4480		}
4481	}
4482	free(sc->msix_ih, M_DEVBUF);
4483
4484abort_with_res:
4485	for (i = 0; i < sc->num_slices; i++) {
4486		rid = i + 1;
4487		if (sc->msix_irq_res[i] != NULL)
4488			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4489					     sc->msix_irq_res[i]);
4490		sc->msix_irq_res[i] = NULL;
4491	}
4492	free(sc->msix_irq_res, M_DEVBUF);
4493
4494abort_with_msix:
4495	pci_release_msi(sc->dev);
4496
4497abort_with_msix_table:
4498	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4499			     sc->msix_table_res);
4500
4501	return err;
4502}
4503
4504static int
4505mxge_add_single_irq(mxge_softc_t *sc)
4506{
4507	int count, err, rid;
4508
4509	count = pci_msi_count(sc->dev);
4510	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4511		rid = 1;
4512	} else {
4513		rid = 0;
4514		sc->legacy_irq = 1;
4515	}
4516	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4517					     RF_SHAREABLE | RF_ACTIVE);
4518	if (sc->irq_res == NULL) {
4519		device_printf(sc->dev, "could not alloc interrupt\n");
4520		return ENXIO;
4521	}
4522	if (mxge_verbose)
4523		device_printf(sc->dev, "using %s irq %jd\n",
4524			      sc->legacy_irq ? "INTx" : "MSI",
4525			      rman_get_start(sc->irq_res));
4526	err = bus_setup_intr(sc->dev, sc->irq_res,
4527			     INTR_TYPE_NET | INTR_MPSAFE, NULL,
4528			     mxge_intr, &sc->ss[0], &sc->ih);
4529	if (err != 0) {
4530		bus_release_resource(sc->dev, SYS_RES_IRQ,
4531				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4532		if (!sc->legacy_irq)
4533			pci_release_msi(sc->dev);
4534	}
4535	return err;
4536}
4537
4538static void
4539mxge_rem_msix_irqs(mxge_softc_t *sc)
4540{
4541	int i, rid;
4542
4543	for (i = 0; i < sc->num_slices; i++) {
4544		if (sc->msix_ih[i] != NULL) {
4545			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4546					  sc->msix_ih[i]);
4547			sc->msix_ih[i] = NULL;
4548		}
4549	}
4550	free(sc->msix_ih, M_DEVBUF);
4551
4552	for (i = 0; i < sc->num_slices; i++) {
4553		rid = i + 1;
4554		if (sc->msix_irq_res[i] != NULL)
4555			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4556					     sc->msix_irq_res[i]);
4557		sc->msix_irq_res[i] = NULL;
4558	}
4559	free(sc->msix_irq_res, M_DEVBUF);
4560
4561	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4562			     sc->msix_table_res);
4563
4564	pci_release_msi(sc->dev);
4565	return;
4566}
4567
4568static void
4569mxge_rem_single_irq(mxge_softc_t *sc)
4570{
4571	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4572	bus_release_resource(sc->dev, SYS_RES_IRQ,
4573			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4574	if (!sc->legacy_irq)
4575		pci_release_msi(sc->dev);
4576}
4577
4578static void
4579mxge_rem_irq(mxge_softc_t *sc)
4580{
4581	if (sc->num_slices > 1)
4582		mxge_rem_msix_irqs(sc);
4583	else
4584		mxge_rem_single_irq(sc);
4585}
4586
4587static int
4588mxge_add_irq(mxge_softc_t *sc)
4589{
4590	int err;
4591
4592	if (sc->num_slices > 1)
4593		err = mxge_add_msix_irqs(sc);
4594	else
4595		err = mxge_add_single_irq(sc);
4596
4597	if (0 && err == 0 && sc->num_slices > 1) {
4598		mxge_rem_msix_irqs(sc);
4599		err = mxge_add_msix_irqs(sc);
4600	}
4601	return err;
4602}
4603
4604static int
4605mxge_attach(device_t dev)
4606{
4607	mxge_cmd_t cmd;
4608	mxge_softc_t *sc = device_get_softc(dev);
4609	if_t ifp;
4610	int err, rid;
4611
4612	sc->dev = dev;
4613	mxge_fetch_tunables(sc);
4614
4615	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4616	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4617				  taskqueue_thread_enqueue, &sc->tq);
4618	if (sc->tq == NULL) {
4619		err = ENOMEM;
4620		goto abort_with_nothing;
4621	}
4622
4623	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4624				 1,			/* alignment */
4625				 0,			/* boundary */
4626				 BUS_SPACE_MAXADDR,	/* low */
4627				 BUS_SPACE_MAXADDR,	/* high */
4628				 NULL, NULL,		/* filter */
4629				 65536 + 256,		/* maxsize */
4630				 MXGE_MAX_SEND_DESC, 	/* num segs */
4631				 65536,			/* maxsegsize */
4632				 0,			/* flags */
4633				 NULL, NULL,		/* lock */
4634				 &sc->parent_dmat);	/* tag */
4635
4636	if (err != 0) {
4637		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4638			      err);
4639		goto abort_with_tq;
4640	}
4641
4642	ifp = sc->ifp = if_alloc(IFT_ETHER);
4643	if (ifp == NULL) {
4644		device_printf(dev, "can not if_alloc()\n");
4645		err = ENOSPC;
4646		goto abort_with_parent_dmat;
4647	}
4648	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4649
4650	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4651		 device_get_nameunit(dev));
4652	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4653	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4654		 "%s:drv", device_get_nameunit(dev));
4655	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4656		 MTX_NETWORK_LOCK, MTX_DEF);
4657
4658	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4659
4660	mxge_setup_cfg_space(sc);
4661
4662	/* Map the board into the kernel */
4663	rid = PCIR_BARS;
4664	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4665					     RF_ACTIVE);
4666	if (sc->mem_res == NULL) {
4667		device_printf(dev, "could not map memory\n");
4668		err = ENXIO;
4669		goto abort_with_lock;
4670	}
4671	sc->sram = rman_get_virtual(sc->mem_res);
4672	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4673	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4674		device_printf(dev, "impossible memory region size %jd\n",
4675			      rman_get_size(sc->mem_res));
4676		err = ENXIO;
4677		goto abort_with_mem_res;
4678	}
4679
4680	/* make NULL terminated copy of the EEPROM strings section of
4681	   lanai SRAM */
4682	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4683	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4684				rman_get_bushandle(sc->mem_res),
4685				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4686				sc->eeprom_strings,
4687				MXGE_EEPROM_STRINGS_SIZE - 2);
4688	err = mxge_parse_strings(sc);
4689	if (err != 0)
4690		goto abort_with_mem_res;
4691
4692	/* Enable write combining for efficient use of PCIe bus */
4693	mxge_enable_wc(sc);
4694
4695	/* Allocate the out of band dma memory */
4696	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4697			     sizeof (mxge_cmd_t), 64);
4698	if (err != 0)
4699		goto abort_with_mem_res;
4700	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4701	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4702	if (err != 0)
4703		goto abort_with_cmd_dma;
4704
4705	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4706	if (err != 0)
4707		goto abort_with_zeropad_dma;
4708
4709	/* select & load the firmware */
4710	err = mxge_select_firmware(sc);
4711	if (err != 0)
4712		goto abort_with_dmabench;
4713	sc->intr_coal_delay = mxge_intr_coal_delay;
4714
4715	mxge_slice_probe(sc);
4716	err = mxge_alloc_slices(sc);
4717	if (err != 0)
4718		goto abort_with_dmabench;
4719
4720	err = mxge_reset(sc, 0);
4721	if (err != 0)
4722		goto abort_with_slices;
4723
4724	err = mxge_alloc_rings(sc);
4725	if (err != 0) {
4726		device_printf(sc->dev, "failed to allocate rings\n");
4727		goto abort_with_slices;
4728	}
4729
4730	err = mxge_add_irq(sc);
4731	if (err != 0) {
4732		device_printf(sc->dev, "failed to add irq\n");
4733		goto abort_with_rings;
4734	}
4735
4736	if_setbaudrate(ifp, IF_Gbps(10));
4737	if_setcapabilities(ifp, IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4738		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4739		IFCAP_RXCSUM_IPV6);
4740#if defined(INET) || defined(INET6)
4741	if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
4742#endif
4743
4744#ifdef MXGE_NEW_VLAN_API
4745	if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM, 0);
4746
4747	/* Only FW 1.4.32 and newer can do TSO over vlans */
4748	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4749	    sc->fw_ver_tiny >= 32)
4750		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTSO, 0);
4751#endif
4752	sc->max_mtu = mxge_max_mtu(sc);
4753	if (sc->max_mtu >= 9000)
4754		if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
4755	else
4756		device_printf(dev, "MTU limited to %d.  Install "
4757			      "latest firmware for 9000 byte jumbo support\n",
4758			      sc->max_mtu - ETHER_HDR_LEN);
4759	if_sethwassist(ifp, CSUM_TCP | CSUM_UDP | CSUM_TSO);
4760	if_sethwassistbits(ifp, CSUM_TCP_IPV6 | CSUM_UDP_IPV6, 0);
4761	/* check to see if f/w supports TSO for IPv6 */
4762	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4763		if (CSUM_TCP_IPV6)
4764			if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
4765		sc->max_tso6_hlen = min(cmd.data0,
4766					sizeof (sc->ss[0].scratch));
4767	}
4768	if_setcapenable(ifp, if_getcapabilities(ifp));
4769	if (sc->lro_cnt == 0)
4770		if_setcapenablebit(ifp, 0, IFCAP_LRO);
4771	if_setinitfn(ifp, mxge_init);
4772	if_setsoftc(ifp, sc);
4773	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
4774	if_setioctlfn(ifp, mxge_ioctl);
4775	if_setstartfn(ifp, mxge_start);
4776	if_setgetcounterfn(ifp, mxge_get_counter);
4777	if_sethwtsomax(ifp, IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
4778	if_sethwtsomaxsegcount(ifp, sc->ss[0].tx.max_desc);
4779	if_sethwtsomaxsegsize(ifp, IP_MAXPACKET);
4780	/* Initialise the ifmedia structure */
4781	ifmedia_init(&sc->media, 0, mxge_media_change,
4782		     mxge_media_status);
4783	mxge_media_init(sc);
4784	mxge_media_probe(sc);
4785	sc->dying = 0;
4786	ether_ifattach(ifp, sc->mac_addr);
4787	/* ether_ifattach sets mtu to ETHERMTU */
4788	if (mxge_initial_mtu != ETHERMTU)
4789		mxge_change_mtu(sc, mxge_initial_mtu);
4790
4791	mxge_add_sysctls(sc);
4792	if_settransmitfn(ifp, mxge_transmit);
4793	if_setqflushfn(ifp, mxge_qflush);
4794	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4795				device_get_nameunit(sc->dev));
4796	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4797	return 0;
4798
4799abort_with_rings:
4800	mxge_free_rings(sc);
4801abort_with_slices:
4802	mxge_free_slices(sc);
4803abort_with_dmabench:
4804	mxge_dma_free(&sc->dmabench_dma);
4805abort_with_zeropad_dma:
4806	mxge_dma_free(&sc->zeropad_dma);
4807abort_with_cmd_dma:
4808	mxge_dma_free(&sc->cmd_dma);
4809abort_with_mem_res:
4810	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4811abort_with_lock:
4812	pci_disable_busmaster(dev);
4813	mtx_destroy(&sc->cmd_mtx);
4814	mtx_destroy(&sc->driver_mtx);
4815	if_free(ifp);
4816abort_with_parent_dmat:
4817	bus_dma_tag_destroy(sc->parent_dmat);
4818abort_with_tq:
4819	if (sc->tq != NULL) {
4820		taskqueue_drain(sc->tq, &sc->watchdog_task);
4821		taskqueue_free(sc->tq);
4822		sc->tq = NULL;
4823	}
4824abort_with_nothing:
4825	return err;
4826}
4827
4828static int
4829mxge_detach(device_t dev)
4830{
4831	mxge_softc_t *sc = device_get_softc(dev);
4832
4833	if (mxge_vlans_active(sc)) {
4834		device_printf(sc->dev,
4835			      "Detach vlans before removing module\n");
4836		return EBUSY;
4837	}
4838	mtx_lock(&sc->driver_mtx);
4839	sc->dying = 1;
4840	if (if_getdrvflags(sc->ifp) & IFF_DRV_RUNNING)
4841		mxge_close(sc, 0);
4842	mtx_unlock(&sc->driver_mtx);
4843	ether_ifdetach(sc->ifp);
4844	if (sc->tq != NULL) {
4845		taskqueue_drain(sc->tq, &sc->watchdog_task);
4846		taskqueue_free(sc->tq);
4847		sc->tq = NULL;
4848	}
4849	callout_drain(&sc->co_hdl);
4850	ifmedia_removeall(&sc->media);
4851	mxge_dummy_rdma(sc, 0);
4852	mxge_rem_sysctls(sc);
4853	mxge_rem_irq(sc);
4854	mxge_free_rings(sc);
4855	mxge_free_slices(sc);
4856	mxge_dma_free(&sc->dmabench_dma);
4857	mxge_dma_free(&sc->zeropad_dma);
4858	mxge_dma_free(&sc->cmd_dma);
4859	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4860	pci_disable_busmaster(dev);
4861	mtx_destroy(&sc->cmd_mtx);
4862	mtx_destroy(&sc->driver_mtx);
4863	if_free(sc->ifp);
4864	bus_dma_tag_destroy(sc->parent_dmat);
4865	return 0;
4866}
4867
4868static int
4869mxge_shutdown(device_t dev)
4870{
4871	return 0;
4872}
4873
4874/*
4875  This file uses Myri10GE driver indentation.
4876
4877  Local Variables:
4878  c-file-style:"linux"
4879  tab-width:8
4880  End:
4881*/
4882