if_mxge.c revision 170733
18097Sjkh/******************************************************************************
28097Sjkh
38097SjkhCopyright (c) 2006, Myricom Inc.
48097SjkhAll rights reserved.
58097Sjkh
68097SjkhRedistribution and use in source and binary forms, with or without
714670Sjkhmodification, are permitted provided that the following conditions are met:
88097Sjkh
98097Sjkh 1. Redistributions of source code must retain the above copyright notice,
108097Sjkh    this list of conditions and the following disclaimer.
118097Sjkh
128097Sjkh 2. Redistributions in binary form must reproduce the above copyright
138097Sjkh    notice, this list of conditions and the following disclaimer in the
148097Sjkh    documentation and/or other materials provided with the distribution.
158097Sjkh
168881Srgrimes 3. Neither the name of the Myricom Inc, nor the names of its
178881Srgrimes    contributors may be used to endorse or promote products derived from
188097Sjkh    this software without specific prior written permission.
198097Sjkh
208097SjkhTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
218097SjkhAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
228097SjkhIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
238097SjkhARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
248097SjkhLIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
258097SjkhCONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
268097SjkhSUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
278097SjkhINTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
288097SjkhCONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
298097SjkhARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
308097SjkhPOSSIBILITY OF SUCH DAMAGE.
318097Sjkh
328097Sjkh***************************************************************************/
338097Sjkh
348097Sjkh#include <sys/cdefs.h>
358097Sjkh__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 170733 2007-06-14 19:35:03Z gallatin $");
368097Sjkh
378097Sjkh#include <sys/param.h>
388097Sjkh#include <sys/systm.h>
398097Sjkh#include <sys/linker.h>
408097Sjkh#include <sys/firmware.h>
418097Sjkh#include <sys/endian.h>
428097Sjkh#include <sys/sockio.h>
438097Sjkh#include <sys/mbuf.h>
448097Sjkh#include <sys/malloc.h>
458097Sjkh#include <sys/kdb.h>
468097Sjkh#include <sys/kernel.h>
478097Sjkh#include <sys/lock.h>
488097Sjkh#include <sys/module.h>
498097Sjkh#include <sys/memrange.h>
508097Sjkh#include <sys/socket.h>
518097Sjkh#include <sys/sysctl.h>
5212661Speter#include <sys/sx.h>
5312661Speter
548281Sjkh#include <net/if.h>
558405Sjkh#include <net/if_arp.h>
5612661Speter#include <net/ethernet.h>
578097Sjkh#include <net/if_dl.h>
588208Sjkh#include <net/if_media.h>
598208Sjkh
608174Sjkh#include <net/bpf.h>
618174Sjkh
628174Sjkh#include <net/if_types.h>
638174Sjkh#include <net/if_vlan_var.h>
648174Sjkh#include <net/zlib.h>
6514321Sjkh
6614321Sjkh#include <netinet/in_systm.h>
6714321Sjkh#include <netinet/in.h>
688208Sjkh#include <netinet/ip.h>
698208Sjkh#include <netinet/tcp.h>
708208Sjkh
718208Sjkh#include <machine/bus.h>
728208Sjkh#include <machine/in_cksum.h>
7312661Speter#include <machine/resource.h>
7412661Speter#include <sys/bus.h>
758549Sjkh#include <sys/rman.h>
7612661Speter
778208Sjkh#include <dev/pci/pcireg.h>
7812661Speter#include <dev/pci/pcivar.h>
7912661Speter
8012661Speter#include <vm/vm.h>		/* for pmap_mapdev() */
8112661Speter#include <vm/pmap.h>
828705Sjkh
838705Sjkh#if defined(__i386) || defined(__amd64)
848705Sjkh#include <machine/specialreg.h>
858705Sjkh#endif
868705Sjkh
878705Sjkh#include <dev/mxge/mxge_mcp.h>
888705Sjkh#include <dev/mxge/mcp_gen_header.h>
898705Sjkh#include <dev/mxge/if_mxge_var.h>
9012661Speter
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on Nvidia bridges */
static int mxge_force_firmware = 0;	/* 0 = probe; 1 = force aligned fw; else unaligned */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay */
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;	/* enable pause frames */
static int mxge_verbose = 0;		/* extra boot-time chatter */
static int mxge_lro_cnt = 8;
static int mxge_ticks;			/* tick interval; 0 = pick default at attach (TODO confirm) */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";	/* fw tolerant of unaligned completions */
static char *mxge_fw_aligned = "mxge_eth_z8e";		/* fw requiring aligned completions */

/* newbus device interface entry points */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware images are loaded through firmware(9) */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);

/* forward declarations for routines referenced before definition */
static int mxge_load_firmware(mxge_softc_t *sc);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
13712661Speter
13812661Speterstatic int
13912661Spetermxge_probe(device_t dev)
14012661Speter{
14112661Speter  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
14212661Speter      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
14312661Speter	  device_set_desc(dev, "Myri10G-PCIE-8A");
1449202Srgrimes	  return 0;
14512661Speter  }
1469202Srgrimes  return ENXIO;
14712661Speter}
14812661Speter
/*
 * Try to map the NIC's SRAM BAR as write-combining.  On i386/amd64
 * the PAT-based pmap_change_attr() is attempted first; if that fails
 * (or on other arches) fall back to the memrange (MTRR) interface.
 * On success sc->wc is set so later code knows WC is active.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	len = rman_get_size(sc->mem_res);
#if defined(__i386) || defined(__amd64)
	/* preferred path: PAT attribute change on the mapped VA */
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err == 0)
		return;
	else
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
#endif
	/* fallback: program an MTRR covering the BAR's physical range */
	pa = rman_get_start(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		/* non-fatal: the device still works, just slower PIO */
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	} else {
		sc->wc = 1;
	}
}
1828174Sjkh
1838174Sjkh
1848174Sjkh/* callback to get our DMA address */
1858097Sjkhstatic void
1868097Sjkhmxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
1878097Sjkh			 int error)
1888097Sjkh{
1898097Sjkh	if (error == 0) {
1908208Sjkh		*(bus_addr_t *) arg = segs->ds_addr;
1918208Sjkh	}
1928097Sjkh}
1938097Sjkh
/*
 * Allocate a coherent, zeroed DMA buffer of 'bytes' bytes with the
 * given alignment, confined to a single segment within a 4KB
 * boundary.  Fills in dma->dmat, dma->map, dma->addr (KVA) and
 * dma->bus_addr.  Returns 0 or a bus_dma error; on failure all
 * partially-acquired resources are released via the goto chain.
 * Caller frees with mxge_dma_free().
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; callback records bus_addr (0 flags = can sleep,
	 * so the callback has run by the time bus_dmamap_load returns) */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
2448336Sjkh
2458307Sjkh
/*
 * Release a buffer obtained from mxge_dma_alloc().  The unload /
 * free / tag-destroy order is mandated by bus_dma(9) and must not
 * be reordered.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
25312661Speter
25412661Speter/*
25512661Speter * The eeprom strings on the lanaiX have the format
25612661Speter * SN=x\0
25712661Speter * MAC=x:x:x:x:x:x\0
25812661Speter * PC=text\0
25912661Speter */
26012661Speter
/*
 * Parse the NUL-separated eeprom strings (SN=, MAC=, PC=) out of
 * sc->eeprom_strings into the softc.  Returns 0 if a MAC address
 * was found, ENXIO otherwise.
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
/* advance ptr past the current NUL-terminated string (stays <= limit) */
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/*
			 * Octet stride trick: +1 here plus the +3 at the
			 * top of the loop lands ptr on the first hex digit
			 * after "MAC=" (offset 4); each later +3 skips a
			 * "xx:" group.  strtoul stops at the ':' itself.
			 */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			/* NOTE(review): strncpy with size-1 relies on the
			 * softc being zeroed for NUL termination — confirm */
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
3038810Sjkh
3048208Sjkh#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (ck804/mcp55) PCIe
 * bridge so that completions arrive 8-byte aligned.  The required
 * register (0x178) lives in extended config space, which FreeBSD's
 * pci config accessors could not reach at the time, so the chipset's
 * memory-mapped config window is located and mapped by hand via
 * pmap_mapdev().  Silently does nothing if the parent bridge is not
 * a recognized Nvidia part or the window cannot be found/verified.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* grandparent = the PCIe bridge above our slot */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)	/* not Nvidia */
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* gather the bridge's bus/slot/function and IDs for the
	 * config-window offset computation and the sanity check below */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* each bus gets 1MB of config window, each function 4KB */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it by verifying the
	 * IDs read through the window match the bridge's */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	/* bit 0x40 enables ECRC generation — TODO confirm vs. Nvidia docs */
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
4288278Sjkh#else
4299202Srgrimesstatic void
4308405Sjkhmxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
4319202Srgrimes{
43210882Speter	device_printf(sc->dev,
4338278Sjkh		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
4348107Sjkh	return;
4359202Srgrimes}
4369202Srgrimes#endif
4379202Srgrimes
4389202Srgrimes
/*
 * Ask the firmware to run read, write, and read+write DMA benchmarks
 * against the host dmabench buffer, recording the computed throughput
 * in sc->read_dma / sc->write_dma / sc->read_write_dma.  test_type
 * selects the normal benchmark or MXGEFW_CMD_UNALIGNED_TEST, which
 * additionally aborts on the first unaligned completion seen.
 * Returns 0 or the first failing command's status.
 */
static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx.boundary;

	/* read test: transfer length in the upper 16 bits of data2 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	/* (transfers * len * 2) / ticks — units per the comment above */
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	/* write test: length in the lower 16 bits */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	/* combined read+write test: length in both halves */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	/* the unaligned probe is expected to fail; stay quiet for it */
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
49912661Speter
50012661Speter/*
5018351Sjkh * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
5028405Sjkh * when the PCI-E Completion packets are aligned on an 8-byte
5038405Sjkh * boundary.  Some PCI-E chip sets always align Completion packets; on
5048405Sjkh * the ones that do not, the alignment can be enforced by enabling
5058405Sjkh * ECRC generation (if supported).
5068405Sjkh *
5078405Sjkh * When PCI-E Completion packets are not aligned, it is actually more
5088405Sjkh * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
5098601Sjkh *
5108601Sjkh * If the driver can neither enable ECRC nor verify that it has
5118629Sjkh * already been enabled, then it must use a firmware image which works
5128629Sjkh * around unaligned completion packets (ethp_z8e.dat), and it should
5138629Sjkh * also ensure that it never gives the device a Read-DMA which is
5148351Sjkh * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
5158351Sjkh * enabled, then the driver should use the aligned (eth_z8e.dat)
51612661Speter * firmware image, and set tx.boundary to 4KB.
5178351Sjkh */
5188351Sjkh
/*
 * Try the optimized (aligned-completion) firmware on this host:
 * verify the PCIe max read request size supports a 4KB tx boundary,
 * load the aligned image, attempt to enable ECRC on the upstream
 * bridge, then run the unaligned-detecting DMA test.  Returns 0 if
 * the aligned firmware is safe to keep; non-zero means the caller
 * should fall back to the unaligned ("ethp") image.
 */
static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx.boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* reg + 0x8 = PCIe Device Control; bits 14:12 = MRRS */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
				      pectl);
			sc->tx.boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	/* E2BIG = firmware saw an unaligned completion (expected path) */
	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}
5718792Sjkh
5728792Sjkhstatic int
5738792Sjkhmxge_select_firmware(mxge_softc_t *sc)
5748792Sjkh{
5758837Sjkh	int aligned = 0;
5768837Sjkh
57714321Sjkh
5788837Sjkh	if (mxge_force_firmware != 0) {
5798837Sjkh		if (mxge_force_firmware == 1)
58012661Speter			aligned = 1;
58112661Speter		else
58212661Speter			aligned = 0;
58312661Speter		if (mxge_verbose)
58412661Speter			device_printf(sc->dev,
58512661Speter				      "Assuming %s completions (forced)\n",
58612661Speter				      aligned ? "aligned" : "unaligned");
5878351Sjkh		goto abort;
5888351Sjkh	}
5898351Sjkh
59012661Speter	/* if the PCIe link width is 4 or less, we can use the aligned
5918351Sjkh	   firmware and skip any checks */
59212661Speter	if (sc->link_width != 0 && sc->link_width <= 4) {
5938351Sjkh		device_printf(sc->dev,
5948351Sjkh			      "PCIe x%d Link, expect reduced performance\n",
5958351Sjkh			      sc->link_width);
59612661Speter		aligned = 1;
5978351Sjkh		goto abort;
59812661Speter	}
5998351Sjkh
60012661Speter	if (0 == mxge_firmware_probe(sc))
60112661Speter		return 0;
6028107Sjkh
6038792Sjkhabort:
60412661Speter	if (aligned) {
6058792Sjkh		sc->fw_name = mxge_fw_aligned;
60614321Sjkh		sc->tx.boundary = 4096;
6078792Sjkh	} else {
6088792Sjkh		sc->fw_name = mxge_fw_unaligned;
6098556Sjkh		sc->tx.boundary = 2048;
6108768Sjkh	}
6119202Srgrimes	return (mxge_load_firmware(sc));
61212661Speter}
6139202Srgrimes
/*
 * Launders the const off a firmware image pointer: store through
 * ro_char, read back through rw_char.  Needed because the copy
 * routine below takes a non-const source pointer.  The data is
 * never written through the rw_char view.
 */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
6198837Sjkh
6208792Sjkhstatic int
62114321Sjkhmxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
6228792Sjkh{
62312661Speter
6248208Sjkh
6258208Sjkh	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
62612661Speter		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
62712661Speter			      be32toh(hdr->mcp_type));
62812661Speter		return EIO;
6298208Sjkh	}
6308281Sjkh
6318549Sjkh	/* save firmware version for sysctl */
6328281Sjkh	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
6338097Sjkh	if (mxge_verbose)
6348097Sjkh		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
635
636	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
637	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
638
639	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
640	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
641		device_printf(sc->dev, "Found firmware version %s\n",
642			      sc->fw_version);
643		device_printf(sc->dev, "Driver needs %d.%d\n",
644			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
645		return EINVAL;
646	}
647	return 0;
648
649}
650
/*
 * Fetch the firmware image named sc->fw_name via firmware(9),
 * validate its embedded header, and copy it into NIC SRAM at
 * MXGE_FW_OFFSET in 256-byte PIO chunks.  *limit is the available
 * SRAM size on entry and is updated to the image size on success.
 * Returns 0 or ENOENT/ENOSPC/EIO/validation errors.
 */
static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	fw_data = (const char *)fw->data;
	/* header pointer is stored big-endian in the image
	 * (htobe32 == be32toh for a plain 32-bit byte swap) */
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	/* union qualhack launders const for mxge_pio_copy's source arg */
	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		/* read back to flush the posted PIO writes */
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
711
712/*
713 * Enable or disable periodic RDMAs from the host to make certain
714 * chipsets resend dropped PCIe messages
715 */
716
/*
 * Enable (enable != 0) or disable periodic dummy RDMAs, used to make
 * certain chipsets resend dropped PCIe messages.  Builds an 8-byte-
 * aligned command block on the stack, PIOs it to the boot dummy-rdma
 * mailbox, and polls the DMA'd confirmation word (up to ~21ms) for
 * the firmware's 0xffffffff acknowledgement.
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* round the stack buffer up to an 8-byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll for the firmware's -1 acknowledgement, 1ms per try */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
768
769static int
770mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
771{
772	mcp_cmd_t *buf;
773	char buf_bytes[sizeof(*buf) + 8];
774	volatile mcp_cmd_response_t *response = sc->cmd;
775	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
776	uint32_t dma_low, dma_high;
777	int err, sleep_total = 0;
778
779	/* ensure buf is aligned to 8 bytes */
780	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
781
782	buf->data0 = htobe32(data->data0);
783	buf->data1 = htobe32(data->data1);
784	buf->data2 = htobe32(data->data2);
785	buf->cmd = htobe32(cmd);
786	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
787	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
788
789	buf->response_addr.low = htobe32(dma_low);
790	buf->response_addr.high = htobe32(dma_high);
791	mtx_lock(&sc->cmd_mtx);
792	response->result = 0xffffffff;
793	mb();
794	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
795
796	/* wait up to 20ms */
797	err = EAGAIN;
798	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
799		bus_dmamap_sync(sc->cmd_dma.dmat,
800				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
801		mb();
802		switch (be32toh(response->result)) {
803		case 0:
804			data->data0 = be32toh(response->data);
805			err = 0;
806			break;
807		case 0xffffffff:
808			DELAY(1000);
809			break;
810		case MXGEFW_CMD_UNKNOWN:
811			err = ENOSYS;
812			break;
813		case MXGEFW_CMD_ERROR_UNALIGNED:
814			err = E2BIG;
815			break;
816		default:
817			device_printf(sc->dev,
818				      "mxge: command %d "
819				      "failed, result = %d\n",
820				      cmd, be32toh(response->result));
821			err = ENXIO;
822			break;
823		}
824		if (err != EAGAIN)
825			break;
826	}
827	if (err == EAGAIN)
828		device_printf(sc->dev, "mxge: command %d timed out"
829			      "result = %d\n",
830			      cmd, be32toh(response->result));
831	mtx_unlock(&sc->cmd_mtx);
832	return err;
833}
834
/*
 * Fall-back path when no loadable firmware image is usable: locate
 * the header of the firmware already running on the NIC, copy it to
 * host memory, and validate it.  Also flags firmware 1.4.4–1.4.11,
 * which filters broadcasts unless the NIC is kept in ALLMULTI mode
 * (sc->adopted_rx_filter_bug).  Returns 0 or EIO/ENOMEM/validation
 * errors.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header; the pointer is stored
	 * big-endian in SRAM (htobe32 == be32toh for a byte swap) */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
883
884
/*
 * Download firmware to the NIC, or fall back to adopting whatever is
 * already running there.  On the download path, hands the new image
 * off to the bootstrap MCP and polls a DMA-written confirmation word
 * until the firmware signals it is alive.  Returns 0 on success or
 * an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align the 64-byte handoff block to an 8-byte boundary
	   inside the oversized stack buffer */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		/* adopted firmware is treated as the unaligned variant */
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll up to ~200ms for the firmware's -1 confirmation,
	   syncing the DMA'able command buffer before each re-read */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
967
968static int
969mxge_update_mac_address(mxge_softc_t *sc)
970{
971	mxge_cmd_t cmd;
972	uint8_t *addr = sc->mac_addr;
973	int status;
974
975
976	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
977		     | (addr[2] << 8) | addr[3]);
978
979	cmd.data1 = ((addr[4] << 8) | (addr[5]));
980
981	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
982	return status;
983}
984
985static int
986mxge_change_pause(mxge_softc_t *sc, int pause)
987{
988	mxge_cmd_t cmd;
989	int status;
990
991	if (pause)
992		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
993				       &cmd);
994	else
995		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
996				       &cmd);
997
998	if (status) {
999		device_printf(sc->dev, "Failed to set flow control mode\n");
1000		return ENXIO;
1001	}
1002	sc->pause = pause;
1003	return 0;
1004}
1005
1006static void
1007mxge_change_promisc(mxge_softc_t *sc, int promisc)
1008{
1009	mxge_cmd_t cmd;
1010	int status;
1011
1012	if (promisc)
1013		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1014				       &cmd);
1015	else
1016		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1017				       &cmd);
1018
1019	if (status) {
1020		device_printf(sc->dev, "Failed to set promisc mode\n");
1021	}
1022}
1023
1024static void
1025mxge_set_multicast_list(mxge_softc_t *sc)
1026{
1027	mxge_cmd_t cmd;
1028	struct ifmultiaddr *ifma;
1029	struct ifnet *ifp = sc->ifp;
1030	int err;
1031
1032	/* This firmware is known to not support multicast */
1033	if (!sc->fw_multicast_support)
1034		return;
1035
1036	/* Disable multicast filtering while we play with the lists*/
1037	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1038	if (err != 0) {
1039		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1040		       " error status: %d\n", err);
1041		return;
1042	}
1043
1044	if (sc->adopted_rx_filter_bug)
1045		return;
1046
1047	if (ifp->if_flags & IFF_ALLMULTI)
1048		/* request to disable multicast filtering, so quit here */
1049		return;
1050
1051	/* Flush all the filters */
1052
1053	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1054	if (err != 0) {
1055		device_printf(sc->dev,
1056			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1057			      ", error status: %d\n", err);
1058		return;
1059	}
1060
1061	/* Walk the multicast list, and add each address */
1062
1063	IF_ADDR_LOCK(ifp);
1064	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1065		if (ifma->ifma_addr->sa_family != AF_LINK)
1066			continue;
1067		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1068		      &cmd.data0, 4);
1069		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1070		      &cmd.data1, 2);
1071		cmd.data0 = htonl(cmd.data0);
1072		cmd.data1 = htonl(cmd.data1);
1073		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1074		if (err != 0) {
1075			device_printf(sc->dev, "Failed "
1076			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1077			       "%d\t", err);
1078			/* abort, leaving multicast filtering off */
1079			IF_ADDR_UNLOCK(ifp);
1080			return;
1081		}
1082	}
1083	IF_ADDR_UNLOCK(ifp);
1084	/* Enable multicast filtering */
1085	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1086	if (err != 0) {
1087		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1088		       ", error status: %d\n", err);
1089	}
1090}
1091
1092static int
1093mxge_max_mtu(mxge_softc_t *sc)
1094{
1095	mxge_cmd_t cmd;
1096	int status;
1097
1098	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1099		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1100
1101	/* try to set nbufs to see if it we can
1102	   use virtually contiguous jumbos */
1103	cmd.data0 = 0;
1104	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1105			       &cmd);
1106	if (status == 0)
1107		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1108
1109	/* otherwise, we're limited to MJUMPAGESIZE */
1110	return MJUMPAGESIZE - MXGEFW_PAD;
1111}
1112
/*
 * Reset the NIC firmware and re-establish driver/firmware shared
 * state: the interrupt queue (when interrupts_setup is set), the
 * interrupt coalescing/ack/deassert SRAM pointers, and all host-side
 * ring counters.  Finally re-applies MAC address, promiscuity, pause
 * and multicast settings.  Returns 0 on success or an errno.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{

	mxge_cmd_t cmd;
	size_t bytes;
	int status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
		memset(sc->rx_done.entry, 0, bytes);
		cmd.data0 = (uint32_t)bytes;
		status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
		cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
		cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
		/* errors are OR'ed together and checked once below */
		status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
	}

	/* the firmware returns SRAM offsets in cmd.data0; convert
	   them to host pointers into the mapped SRAM window */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	/* reset mcp/driver shared state back to 0 */
	sc->rx_done.idx = 0;
	sc->rx_done.cnt = 0;
	sc->tx.req = 0;
	sc->tx.done = 0;
	sc->tx.pkt_done = 0;
	sc->tx.wake = 0;
	sc->tx_defrag = 0;
	sc->tx.stall = 0;
	sc->rx_big.cnt = 0;
	sc->rx_small.cnt = 0;
	sc->rdma_tags_available = 15;
	sc->fw_stats->valid = 0;
	sc->fw_stats->send_done_count = 0;
	sc->lro_bad_csum = 0;
	sc->lro_queued = 0;
	sc->lro_flushed = 0;
	/* re-apply the configuration the firmware lost in the reset */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}
1191
1192static int
1193mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1194{
1195        mxge_softc_t *sc;
1196        unsigned int intr_coal_delay;
1197        int err;
1198
1199        sc = arg1;
1200        intr_coal_delay = sc->intr_coal_delay;
1201        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1202        if (err != 0) {
1203                return err;
1204        }
1205        if (intr_coal_delay == sc->intr_coal_delay)
1206                return 0;
1207
1208        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1209                return EINVAL;
1210
1211	mtx_lock(&sc->driver_mtx);
1212	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1213	sc->intr_coal_delay = intr_coal_delay;
1214
1215	mtx_unlock(&sc->driver_mtx);
1216        return err;
1217}
1218
1219static int
1220mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1221{
1222        mxge_softc_t *sc;
1223        unsigned int enabled;
1224        int err;
1225
1226        sc = arg1;
1227        enabled = sc->pause;
1228        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1229        if (err != 0) {
1230                return err;
1231        }
1232        if (enabled == sc->pause)
1233                return 0;
1234
1235	mtx_lock(&sc->driver_mtx);
1236	err = mxge_change_pause(sc, enabled);
1237	mtx_unlock(&sc->driver_mtx);
1238        return err;
1239}
1240
1241static int
1242mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1243{
1244	struct ifnet *ifp;
1245	int err;
1246
1247	ifp = sc->ifp;
1248	if (lro_cnt == 0)
1249		ifp->if_capenable &= ~IFCAP_LRO;
1250	else
1251		ifp->if_capenable |= IFCAP_LRO;
1252	sc->lro_cnt = lro_cnt;
1253	callout_stop(&sc->co_hdl);
1254	mxge_close(sc);
1255	err = mxge_open(sc);
1256	if (err == 0)
1257		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
1258	return err;
1259}
1260
1261static int
1262mxge_change_lro(SYSCTL_HANDLER_ARGS)
1263{
1264	mxge_softc_t *sc;
1265	unsigned int lro_cnt;
1266	int err;
1267
1268	sc = arg1;
1269	lro_cnt = sc->lro_cnt;
1270	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1271	if (err != 0)
1272		return err;
1273
1274	if (lro_cnt == sc->lro_cnt)
1275		return 0;
1276
1277	if (lro_cnt > 128)
1278		return EINVAL;
1279
1280	mtx_lock(&sc->driver_mtx);
1281	err = mxge_change_lro_locked(sc, lro_cnt);
1282	mtx_unlock(&sc->driver_mtx);
1283	return err;
1284}
1285
1286static int
1287mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1288{
1289        int err;
1290
1291        if (arg1 == NULL)
1292                return EFAULT;
1293        arg2 = be32toh(*(int *)arg1);
1294        arg1 = NULL;
1295        err = sysctl_handle_int(oidp, arg1, arg2, req);
1296
1297        return err;
1298}
1299
1300static void
1301mxge_add_sysctls(mxge_softc_t *sc)
1302{
1303	struct sysctl_ctx_list *ctx;
1304	struct sysctl_oid_list *children;
1305	mcp_irq_data_t *fw;
1306
1307	ctx = device_get_sysctl_ctx(sc->dev);
1308	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1309	fw = sc->fw_stats;
1310
1311	/* random information */
1312	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1313		       "firmware_version",
1314		       CTLFLAG_RD, &sc->fw_version,
1315		       0, "firmware version");
1316	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1317		       "serial_number",
1318		       CTLFLAG_RD, &sc->serial_number_string,
1319		       0, "serial number");
1320	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1321		       "product_code",
1322		       CTLFLAG_RD, &sc->product_code_string,
1323		       0, "product_code");
1324	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1325		       "pcie_link_width",
1326		       CTLFLAG_RD, &sc->link_width,
1327		       0, "tx_boundary");
1328	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1329		       "tx_boundary",
1330		       CTLFLAG_RD, &sc->tx.boundary,
1331		       0, "tx_boundary");
1332	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1333		       "write_combine",
1334		       CTLFLAG_RD, &sc->wc,
1335		       0, "write combining PIO?");
1336	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1337		       "read_dma_MBs",
1338		       CTLFLAG_RD, &sc->read_dma,
1339		       0, "DMA Read speed in MB/s");
1340	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1341		       "write_dma_MBs",
1342		       CTLFLAG_RD, &sc->write_dma,
1343		       0, "DMA Write speed in MB/s");
1344	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1345		       "read_write_dma_MBs",
1346		       CTLFLAG_RD, &sc->read_write_dma,
1347		       0, "DMA concurrent Read/Write speed in MB/s");
1348
1349
1350	/* performance related tunables */
1351	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1352			"intr_coal_delay",
1353			CTLTYPE_INT|CTLFLAG_RW, sc,
1354			0, mxge_change_intr_coal,
1355			"I", "interrupt coalescing delay in usecs");
1356
1357	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1358			"flow_control_enabled",
1359			CTLTYPE_INT|CTLFLAG_RW, sc,
1360			0, mxge_change_flow_control,
1361			"I", "interrupt coalescing delay in usecs");
1362
1363	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1364		       "deassert_wait",
1365		       CTLFLAG_RW, &mxge_deassert_wait,
1366		       0, "Wait for IRQ line to go low in ihandler");
1367
1368	/* stats block from firmware is in network byte order.
1369	   Need to swap it */
1370	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1371			"link_up",
1372			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1373			0, mxge_handle_be32,
1374			"I", "link up");
1375	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1376			"rdma_tags_available",
1377			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1378			0, mxge_handle_be32,
1379			"I", "rdma_tags_available");
1380	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1381			"dropped_bad_crc32",
1382			CTLTYPE_INT|CTLFLAG_RD,
1383			&fw->dropped_bad_crc32,
1384			0, mxge_handle_be32,
1385			"I", "dropped_bad_crc32");
1386	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1387			"dropped_bad_phy",
1388			CTLTYPE_INT|CTLFLAG_RD,
1389			&fw->dropped_bad_phy,
1390			0, mxge_handle_be32,
1391			"I", "dropped_bad_phy");
1392	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1393			"dropped_link_error_or_filtered",
1394			CTLTYPE_INT|CTLFLAG_RD,
1395			&fw->dropped_link_error_or_filtered,
1396			0, mxge_handle_be32,
1397			"I", "dropped_link_error_or_filtered");
1398	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1399			"dropped_link_overflow",
1400			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1401			0, mxge_handle_be32,
1402			"I", "dropped_link_overflow");
1403	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1404			"dropped_multicast_filtered",
1405			CTLTYPE_INT|CTLFLAG_RD,
1406			&fw->dropped_multicast_filtered,
1407			0, mxge_handle_be32,
1408			"I", "dropped_multicast_filtered");
1409	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1410			"dropped_no_big_buffer",
1411			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1412			0, mxge_handle_be32,
1413			"I", "dropped_no_big_buffer");
1414	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1415			"dropped_no_small_buffer",
1416			CTLTYPE_INT|CTLFLAG_RD,
1417			&fw->dropped_no_small_buffer,
1418			0, mxge_handle_be32,
1419			"I", "dropped_no_small_buffer");
1420	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1421			"dropped_overrun",
1422			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1423			0, mxge_handle_be32,
1424			"I", "dropped_overrun");
1425	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1426			"dropped_pause",
1427			CTLTYPE_INT|CTLFLAG_RD,
1428			&fw->dropped_pause,
1429			0, mxge_handle_be32,
1430			"I", "dropped_pause");
1431	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1432			"dropped_runt",
1433			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1434			0, mxge_handle_be32,
1435			"I", "dropped_runt");
1436
1437	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1438			"dropped_unicast_filtered",
1439			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1440			0, mxge_handle_be32,
1441			"I", "dropped_unicast_filtered");
1442
1443	/* host counters exported for debugging */
1444	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1445		       "rx_small_cnt",
1446		       CTLFLAG_RD, &sc->rx_small.cnt,
1447		       0, "rx_small_cnt");
1448	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1449		       "rx_big_cnt",
1450		       CTLFLAG_RD, &sc->rx_big.cnt,
1451		       0, "rx_small_cnt");
1452	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1453		       "tx_req",
1454		       CTLFLAG_RD, &sc->tx.req,
1455		       0, "tx_req");
1456	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1457		       "tx_done",
1458		       CTLFLAG_RD, &sc->tx.done,
1459		       0, "tx_done");
1460	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1461		       "tx_pkt_done",
1462		       CTLFLAG_RD, &sc->tx.pkt_done,
1463		       0, "tx_done");
1464	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1465		       "tx_stall",
1466		       CTLFLAG_RD, &sc->tx.stall,
1467		       0, "tx_stall");
1468	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1469		       "tx_wake",
1470		       CTLFLAG_RD, &sc->tx.wake,
1471		       0, "tx_wake");
1472	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1473		       "tx_defrag",
1474		       CTLFLAG_RD, &sc->tx_defrag,
1475		       0, "tx_defrag");
1476
1477	/* verbose printing? */
1478	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1479		       "verbose",
1480		       CTLFLAG_RW, &mxge_verbose,
1481		       0, "verbose printing");
1482
1483	/* lro */
1484	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1485			"lro_cnt",
1486			CTLTYPE_INT|CTLFLAG_RW, sc,
1487			0, mxge_change_lro,
1488			"I", "number of lro merge queues");
1489
1490	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491		       "lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
1492		       0, "number of lro merge queues flushed");
1493
1494	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495		       "lro_queued", CTLFLAG_RD, &sc->lro_queued,
1496		       0, "number of frames appended to lro merge queues");
1497
1498}
1499
/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps.  Entry 0 is
   deliberately NOT written here: the caller (mxge_submit_req)
   submits it last so the firmware never sees a partial chain. */

static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb(); /* order each descriptor write to the NIC */
        }
}
1517
1518/*
1519 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1520 * at most 32 bytes at a time, so as to avoid involving the software
1521 * pio handler in the nic.   We re-write the first segment's flags
1522 * to mark them valid only after writing the entire chain
1523 */
1524
1525static inline void
1526mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
1527                  int cnt)
1528{
1529        int idx, i;
1530        uint32_t *src_ints;
1531	volatile uint32_t *dst_ints;
1532        mcp_kreq_ether_send_t *srcp;
1533	volatile mcp_kreq_ether_send_t *dstp, *dst;
1534	uint8_t last_flags;
1535
1536        idx = tx->req & tx->mask;
1537
1538	last_flags = src->flags;
1539	src->flags = 0;
1540        mb();
1541        dst = dstp = &tx->lanai[idx];
1542        srcp = src;
1543
1544        if ((idx + cnt) < tx->mask) {
1545                for (i = 0; i < (cnt - 1); i += 2) {
1546                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1547                        mb(); /* force write every 32 bytes */
1548                        srcp += 2;
1549                        dstp += 2;
1550                }
1551        } else {
1552                /* submit all but the first request, and ensure
1553                   that it is submitted below */
1554                mxge_submit_req_backwards(tx, src, cnt);
1555                i = 0;
1556        }
1557        if (i < cnt) {
1558                /* submit the first request */
1559                mxge_pio_copy(dstp, srcp, sizeof(*src));
1560                mb(); /* barrier before setting valid flag */
1561        }
1562
1563        /* re-write the last 32-bits with the valid flags */
1564        src->flags = last_flags;
1565        src_ints = (uint32_t *)src;
1566        src_ints+=3;
1567        dst_ints = (volatile uint32_t *)dst;
1568        dst_ints+=3;
1569        *dst_ints =  *src_ints;
1570        tx->req += cnt;
1571        mb();
1572}
1573
1574static void
1575mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt,
1576	       int ip_off)
1577{
1578	mxge_tx_buf_t *tx;
1579	mcp_kreq_ether_send_t *req;
1580	bus_dma_segment_t *seg;
1581	struct ip *ip;
1582	struct tcphdr *tcp;
1583	uint32_t low, high_swapped;
1584	int len, seglen, cum_len, cum_len_next;
1585	int next_is_first, chop, cnt, rdma_count, small;
1586	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1587	uint8_t flags, flags_next;
1588	static int once;
1589
1590	mss = m->m_pkthdr.tso_segsz;
1591
1592	/* negative cum_len signifies to the
1593	 * send loop that we are still in the
1594	 * header portion of the TSO packet.
1595	 */
1596
1597	/* ensure we have the ethernet, IP and TCP
1598	   header together in the first mbuf, copy
1599	   it to a scratch buffer if not */
1600	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1601		m_copydata(m, 0, ip_off + sizeof (*ip),
1602			   sc->scratch);
1603		ip = (struct ip *)(sc->scratch + ip_off);
1604	} else {
1605		ip = (struct ip *)(mtod(m, char *) + ip_off);
1606	}
1607	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1608			    + sizeof (*tcp))) {
1609		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1610			   + sizeof (*tcp),  sc->scratch);
1611		ip = (struct ip *)(mtod(m, char *) + ip_off);
1612	}
1613
1614	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1615	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1616
1617	/* TSO implies checksum offload on this hardware */
1618	cksum_offset = ip_off + (ip->ip_hl << 2);
1619	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1620
1621
1622	/* for TSO, pseudo_hdr_offset holds mss.
1623	 * The firmware figures out where to put
1624	 * the checksum by parsing the header. */
1625	pseudo_hdr_offset = htobe16(mss);
1626
1627	tx = &sc->tx;
1628	req = tx->req_list;
1629	seg = tx->seg_list;
1630	cnt = 0;
1631	rdma_count = 0;
1632	/* "rdma_count" is the number of RDMAs belonging to the
1633	 * current packet BEFORE the current send request. For
1634	 * non-TSO packets, this is equal to "count".
1635	 * For TSO packets, rdma_count needs to be reset
1636	 * to 0 after a segment cut.
1637	 *
1638	 * The rdma_count field of the send request is
1639	 * the number of RDMAs of the packet starting at
1640	 * that request. For TSO send requests with one ore more cuts
1641	 * in the middle, this is the number of RDMAs starting
1642	 * after the last cut in the request. All previous
1643	 * segments before the last cut implicitly have 1 RDMA.
1644	 *
1645	 * Since the number of RDMAs is not known beforehand,
1646	 * it must be filled-in retroactively - after each
1647	 * segmentation cut or at the end of the entire packet.
1648	 */
1649
1650	while (busdma_seg_cnt) {
1651		/* Break the busdma segment up into pieces*/
1652		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1653		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1654		len = seg->ds_len;
1655
1656		while (len) {
1657			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1658			seglen = len;
1659			cum_len_next = cum_len + seglen;
1660			(req-rdma_count)->rdma_count = rdma_count + 1;
1661			if (__predict_true(cum_len >= 0)) {
1662				/* payload */
1663				chop = (cum_len_next > mss);
1664				cum_len_next = cum_len_next % mss;
1665				next_is_first = (cum_len_next == 0);
1666				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1667				flags_next |= next_is_first *
1668					MXGEFW_FLAGS_FIRST;
1669				rdma_count |= -(chop | next_is_first);
1670				rdma_count += chop & !next_is_first;
1671			} else if (cum_len_next >= 0) {
1672				/* header ends */
1673				rdma_count = -1;
1674				cum_len_next = 0;
1675				seglen = -cum_len;
1676				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1677				flags_next = MXGEFW_FLAGS_TSO_PLD |
1678					MXGEFW_FLAGS_FIRST |
1679					(small * MXGEFW_FLAGS_SMALL);
1680			    }
1681
1682			req->addr_high = high_swapped;
1683			req->addr_low = htobe32(low);
1684			req->pseudo_hdr_offset = pseudo_hdr_offset;
1685			req->pad = 0;
1686			req->rdma_count = 1;
1687			req->length = htobe16(seglen);
1688			req->cksum_offset = cksum_offset;
1689			req->flags = flags | ((cum_len & 1) *
1690					      MXGEFW_FLAGS_ALIGN_ODD);
1691			low += seglen;
1692			len -= seglen;
1693			cum_len = cum_len_next;
1694			flags = flags_next;
1695			req++;
1696			cnt++;
1697			rdma_count++;
1698			if (__predict_false(cksum_offset > seglen))
1699				cksum_offset -= seglen;
1700			else
1701				cksum_offset = 0;
1702			if (__predict_false(cnt > tx->max_desc))
1703				goto drop;
1704		}
1705		busdma_seg_cnt--;
1706		seg++;
1707	}
1708	(req-rdma_count)->rdma_count = rdma_count;
1709
1710	do {
1711		req--;
1712		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1713	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1714
1715	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1716	mxge_submit_req(tx, tx->req_list, cnt);
1717	return;
1718
1719drop:
1720	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1721	m_freem(m);
1722	sc->ifp->if_oerrors++;
1723	if (!once) {
1724		printf("tx->max_desc exceeded via TSO!\n");
1725		printf("mss = %d, %ld, %d!\n", mss,
1726		       (long)seg - (long)tx->seg_list, tx->max_desc);
1727		once = 1;
1728	}
1729	return;
1730
1731}
1732
1733/*
1734 * We reproduce the software vlan tag insertion from
1735 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1736 * vlan tag insertion. We need to advertise this in order to have the
1737 * vlan interface respect our csum offload flags.
1738 */
1739static struct mbuf *
1740mxge_vlan_tag_insert(struct mbuf *m)
1741{
1742	struct ether_vlan_header *evl;
1743
1744	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1745	if (__predict_false(m == NULL))
1746		return NULL;
1747	if (m->m_len < sizeof(*evl)) {
1748		m = m_pullup(m, sizeof(*evl));
1749		if (__predict_false(m == NULL))
1750			return NULL;
1751	}
1752	/*
1753	 * Transform the Ethernet header into an Ethernet header
1754	 * with 802.1Q encapsulation.
1755	 */
1756	evl = mtod(m, struct ether_vlan_header *);
1757	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1758	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1759	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1760	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1761	m->m_flags &= ~M_VLANTAG;
1762	return m;
1763}
1764
/*
 * Map a frame for DMA and hand it to the NIC.  Performs software
 * vlan tag insertion when needed, retries the DMA mapping once after
 * defragmenting the chain, dispatches TSO frames to mxge_encap_tso(),
 * builds the firmware send-request list (with optional checksum
 * offload and runt padding), and submits it.  The mbuf is consumed;
 * on failure it is freed and if_oerrors is bumped.
 */
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* ip_off tracks where the IP header starts; it grows when a
	   vlan tag is inserted in front of it */
	ip_off = sizeof (struct ether_header);
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop;
		ip_off += ETHER_VLAN_ENCAP_LEN;
	}

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		sc->tx_defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;


	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt, ip_off);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
			m_copydata(m, 0, ip_off + sizeof (*ip),
				   sc->scratch);
			ip = (struct ip *)(sc->scratch + ip_off);
		} else {
			ip = (struct ip *)(mtod(m, char *) + ip_off);
		}
		cksum_offset = ip_off + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* cksum_offset counts down to the segment that actually
		   contains the start of the checksummed data */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		/* extra descriptor pointing at a pre-zeroed DMA buffer */
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1919
1920
1921
1922
1923static inline void
1924mxge_start_locked(mxge_softc_t *sc)
1925{
1926	struct mbuf *m;
1927	struct ifnet *ifp;
1928	mxge_tx_buf_t *tx;
1929
1930	ifp = sc->ifp;
1931	tx = &sc->tx;
1932	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1933		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1934		if (m == NULL) {
1935			return;
1936		}
1937		/* let BPF see it */
1938		BPF_MTAP(ifp, m);
1939
1940		/* give it to the nic */
1941		mxge_encap(sc, m);
1942	}
1943	/* ran out of transmit slots */
1944	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1945		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1946		tx->stall++;
1947	}
1948}
1949
1950static void
1951mxge_start(struct ifnet *ifp)
1952{
1953	mxge_softc_t *sc = ifp->if_softc;
1954
1955
1956	mtx_lock(&sc->tx_mtx);
1957	mxge_start_locked(sc);
1958	mtx_unlock(&sc->tx_mtx);
1959}
1960
1961/*
1962 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1963 * at most 32 bytes at a time, so as to avoid involving the software
1964 * pio handler in the nic.   We re-write the first segment's low
1965 * DMA address to mark it valid only after we write the entire chunk
1966 * in a burst
1967 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* Stash the first entry's real low address and poison it so the
	   firmware ignores the chunk until the final write below. */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* copy the 8 entries in two 4-entry bursts, fenced by mb() so the
	   NIC never observes a partially-written burst out of order */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the host copy, then validate the chunk for the NIC by
	   writing the real low address last */
	src->addr_low = low;
	dst->addr_low = low;
	mb();
}
1984
1985static int
1986mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1987{
1988	bus_dma_segment_t seg;
1989	struct mbuf *m;
1990	mxge_rx_buf_t *rx = &sc->rx_small;
1991	int cnt, err;
1992
1993	m = m_gethdr(M_DONTWAIT, MT_DATA);
1994	if (m == NULL) {
1995		rx->alloc_fail++;
1996		err = ENOBUFS;
1997		goto done;
1998	}
1999	m->m_len = MHLEN;
2000	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2001				      &seg, &cnt, BUS_DMA_NOWAIT);
2002	if (err != 0) {
2003		m_free(m);
2004		goto done;
2005	}
2006	rx->info[idx].m = m;
2007	rx->shadow[idx].addr_low =
2008		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2009	rx->shadow[idx].addr_high =
2010		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2011
2012done:
2013	if ((idx & 7) == 7)
2014		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2015	return err;
2016}
2017
2018static int
2019mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
2020{
2021	bus_dma_segment_t seg[3];
2022	struct mbuf *m;
2023	mxge_rx_buf_t *rx = &sc->rx_big;
2024	int cnt, err, i;
2025
2026	if (rx->cl_size == MCLBYTES)
2027		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2028	else
2029		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2030	if (m == NULL) {
2031		rx->alloc_fail++;
2032		err = ENOBUFS;
2033		goto done;
2034	}
2035	m->m_len = rx->cl_size;
2036	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2037				      seg, &cnt, BUS_DMA_NOWAIT);
2038	if (err != 0) {
2039		m_free(m);
2040		goto done;
2041	}
2042	rx->info[idx].m = m;
2043
2044	for (i = 0; i < cnt; i++) {
2045		rx->shadow[idx + i].addr_low =
2046			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2047		rx->shadow[idx + i].addr_high =
2048			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2049       }
2050
2051
2052done:
2053       for (i = 0; i < rx->nbufs; i++) {
2054		if ((idx & 7) == 7) {
2055			mxge_submit_8rx(&rx->lanai[idx - 7],
2056					&rx->shadow[idx - 7]);
2057		}
2058		idx++;
2059	}
2060	return err;
2061}
2062
2063/*
2064 *  Myri10GE hardware checksums are not valid if the sender
2065 *  padded the frame with non-zero padding.  This is because
2066 *  the firmware just does a simple 16-bit 1s complement
2067 *  checksum across the entire frame, excluding the first 14
2068 *  bytes.  It is best to simply to check the checksum and
2069 *  tell the stack about it only if the checksum is good
2070 */
2071
/*
 * Verify the firmware's raw 16-bit checksum against the packet's IPv4
 * pseudo-header.  Returns 0 when the checksum verifies (callers treat
 * 0 as "good"), non-zero otherwise or for non-IPv4/TCP/UDP frames.
 */
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
	struct ip *ip;
	uint16_t c;

	eh = mtod(m, struct ether_header *);

	/* only deal with IPv4 TCP & UDP for now */
	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
		return 1;
	ip = (struct ip *)(eh + 1);
	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
			    ip->ip_p != IPPROTO_UDP))
		return 1;

	/* fold the pseudo-header (addresses, length minus IP header,
	   protocol) into the firmware's whole-frame checksum; a valid
	   TCP/UDP checksum makes the combined sum come out to 0xffff */
	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
			    - (ip->ip_hl << 2) + ip->ip_p));
	c ^= 0xffff;
	return (c);
}
2095
/*
 * Strip an 802.1q VLAN header from the frame, moving the tag into the
 * mbuf packet header (M_VLANTAG) and adjusting the firmware's partial
 * checksum so it still covers exactly the remaining payload.
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	/* the 4 tag bytes the firmware included but we are removing */
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* ones-complement subtraction: add ~partial with end-around carry,
	   then fold the 32-bit sum back down to 16 bits (twice, since the
	   first fold can itself carry) */
	(*csum) += ~partial;
	(*csum) +=  ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
	m->m_flags |= M_VLANTAG;
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf.  The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2138
2139
2140static inline void
2141mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2142{
2143	struct ifnet *ifp;
2144	struct mbuf *m;
2145	struct ether_header *eh;
2146	mxge_rx_buf_t *rx;
2147	bus_dmamap_t old_map;
2148	int idx;
2149	uint16_t tcpudp_csum;
2150
2151	ifp = sc->ifp;
2152	rx = &sc->rx_big;
2153	idx = rx->cnt & rx->mask;
2154	rx->cnt += rx->nbufs;
2155	/* save a pointer to the received mbuf */
2156	m = rx->info[idx].m;
2157	/* try to replace the received mbuf */
2158	if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
2159		/* drop the frame -- the old mbuf is re-cycled */
2160		ifp->if_ierrors++;
2161		return;
2162	}
2163
2164	/* unmap the received buffer */
2165	old_map = rx->info[idx].map;
2166	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2167	bus_dmamap_unload(rx->dmat, old_map);
2168
2169	/* swap the bus_dmamap_t's */
2170	rx->info[idx].map = rx->extra_map;
2171	rx->extra_map = old_map;
2172
2173	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2174	 * aligned */
2175	m->m_data += MXGEFW_PAD;
2176
2177	m->m_pkthdr.rcvif = ifp;
2178	m->m_len = m->m_pkthdr.len = len;
2179	ifp->if_ipackets++;
2180	eh = mtod(m, struct ether_header *);
2181	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2182		mxge_vlan_tag_remove(m, &csum);
2183	}
2184	/* if the checksum is valid, mark it in the mbuf header */
2185	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2186		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2187			return;
2188		/* otherwise, it was a UDP frame, or a TCP frame which
2189		   we could not do LRO on.  Tell the stack that the
2190		   checksum is good */
2191		m->m_pkthdr.csum_data = 0xffff;
2192		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2193	}
2194	/* pass the frame up the stack */
2195	(*ifp->if_input)(ifp, m);
2196}
2197
2198static inline void
2199mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2200{
2201	struct ifnet *ifp;
2202	struct ether_header *eh;
2203	struct mbuf *m;
2204	mxge_rx_buf_t *rx;
2205	bus_dmamap_t old_map;
2206	int idx;
2207	uint16_t tcpudp_csum;
2208
2209	ifp = sc->ifp;
2210	rx = &sc->rx_small;
2211	idx = rx->cnt & rx->mask;
2212	rx->cnt++;
2213	/* save a pointer to the received mbuf */
2214	m = rx->info[idx].m;
2215	/* try to replace the received mbuf */
2216	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
2217		/* drop the frame -- the old mbuf is re-cycled */
2218		ifp->if_ierrors++;
2219		return;
2220	}
2221
2222	/* unmap the received buffer */
2223	old_map = rx->info[idx].map;
2224	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2225	bus_dmamap_unload(rx->dmat, old_map);
2226
2227	/* swap the bus_dmamap_t's */
2228	rx->info[idx].map = rx->extra_map;
2229	rx->extra_map = old_map;
2230
2231	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2232	 * aligned */
2233	m->m_data += MXGEFW_PAD;
2234
2235	m->m_pkthdr.rcvif = ifp;
2236	m->m_len = m->m_pkthdr.len = len;
2237	ifp->if_ipackets++;
2238	eh = mtod(m, struct ether_header *);
2239	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2240		mxge_vlan_tag_remove(m, &csum);
2241	}
2242	/* if the checksum is valid, mark it in the mbuf header */
2243	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2244		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2245			return;
2246		/* otherwise, it was a UDP frame, or a TCP frame which
2247		   we could not do LRO on.  Tell the stack that the
2248		   checksum is good */
2249		m->m_pkthdr.csum_data = 0xffff;
2250		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2251	}
2252
2253	/* pass the frame up the stack */
2254	(*ifp->if_input)(ifp, m);
2255}
2256
2257static inline void
2258mxge_clean_rx_done(mxge_softc_t *sc)
2259{
2260	mxge_rx_done_t *rx_done = &sc->rx_done;
2261	struct lro_entry *lro;
2262	int limit = 0;
2263	uint16_t length;
2264	uint16_t checksum;
2265
2266
2267	while (rx_done->entry[rx_done->idx].length != 0) {
2268		length = ntohs(rx_done->entry[rx_done->idx].length);
2269		rx_done->entry[rx_done->idx].length = 0;
2270		checksum = rx_done->entry[rx_done->idx].checksum;
2271		if (length <= (MHLEN - MXGEFW_PAD))
2272			mxge_rx_done_small(sc, length, checksum);
2273		else
2274			mxge_rx_done_big(sc, length, checksum);
2275		rx_done->cnt++;
2276		rx_done->idx = rx_done->cnt & rx_done->mask;
2277
2278		/* limit potential for livelock */
2279		if (__predict_false(++limit > rx_done->mask / 2))
2280			break;
2281	}
2282	while(!SLIST_EMPTY(&sc->lro_active)) {
2283		lro = SLIST_FIRST(&sc->lro_active);
2284		SLIST_REMOVE_HEAD(&sc->lro_active, next);
2285		mxge_lro_flush(sc, lro);
2286	}
2287}
2288
2289
/*
 * Reclaim transmit descriptors up to the firmware's completion index
 * mcp_idx: unmap and free completed mbufs, then un-throttle the stack
 * (and restart transmission) if enough ring space has been recovered.
 */
static inline void
mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
{
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx;

	tx = &sc->tx;
	ifp = sc->ifp;
	/* tx->pkt_done counts whole packets; advance one descriptor at a
	   time until it catches up with the firmware's count */
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			ifp->if_opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		/* flag marks the last descriptor of a packet */
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
           its OK to send packets */

	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		mtx_lock(&sc->tx_mtx);
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
		sc->tx.wake++;
		mxge_start_locked(sc);
		mtx_unlock(&sc->tx_mtx);
	}
}
2332
/*
 * Interrupt handler: reap tx completions and rx packets until the
 * firmware's DMA'd status block says the interrupt is fully serviced,
 * then report link/statistics changes and re-arm the interrupt.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (!sc->msi_enabled) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	/* firmware sets stats_updated when the status block changed */
	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	/* NOTE(review): be32toh(3) is used where htobe32(3) reads more
	   naturally; both are byte swaps, so the stored value is the
	   same -- confirm against the firmware interface spec */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
2400
static void
mxge_init(void *arg)
{
	/* NOTE(review): intentionally empty -- presumably a placeholder
	   for the ifnet if_init hook, with real bring-up handled by
	   mxge_open(); confirm against the attach path */
}
2405
2406
2407
2408static void
2409mxge_free_mbufs(mxge_softc_t *sc)
2410{
2411	int i;
2412
2413	for (i = 0; i <= sc->rx_big.mask; i++) {
2414		if (sc->rx_big.info[i].m == NULL)
2415			continue;
2416		bus_dmamap_unload(sc->rx_big.dmat,
2417				  sc->rx_big.info[i].map);
2418		m_freem(sc->rx_big.info[i].m);
2419		sc->rx_big.info[i].m = NULL;
2420	}
2421
2422	for (i = 0; i <= sc->rx_small.mask; i++) {
2423		if (sc->rx_small.info[i].m == NULL)
2424			continue;
2425		bus_dmamap_unload(sc->rx_small.dmat,
2426				  sc->rx_small.info[i].map);
2427		m_freem(sc->rx_small.info[i].m);
2428		sc->rx_small.info[i].m = NULL;
2429	}
2430
2431	for (i = 0; i <= sc->tx.mask; i++) {
2432		sc->tx.info[i].flag = 0;
2433		if (sc->tx.info[i].m == NULL)
2434			continue;
2435		bus_dmamap_unload(sc->tx.dmat,
2436				  sc->tx.info[i].map);
2437		m_freem(sc->tx.info[i].m);
2438		sc->tx.info[i].m = NULL;
2439	}
2440}
2441
2442static void
2443mxge_free_rings(mxge_softc_t *sc)
2444{
2445	int i;
2446
2447	if (sc->rx_done.entry != NULL)
2448		mxge_dma_free(&sc->rx_done.dma);
2449	sc->rx_done.entry = NULL;
2450	if (sc->tx.req_bytes != NULL)
2451		free(sc->tx.req_bytes, M_DEVBUF);
2452	if (sc->tx.seg_list != NULL)
2453		free(sc->tx.seg_list, M_DEVBUF);
2454	if (sc->rx_small.shadow != NULL)
2455		free(sc->rx_small.shadow, M_DEVBUF);
2456	if (sc->rx_big.shadow != NULL)
2457		free(sc->rx_big.shadow, M_DEVBUF);
2458	if (sc->tx.info != NULL) {
2459		if (sc->tx.dmat != NULL) {
2460			for (i = 0; i <= sc->tx.mask; i++) {
2461				bus_dmamap_destroy(sc->tx.dmat,
2462						   sc->tx.info[i].map);
2463			}
2464			bus_dma_tag_destroy(sc->tx.dmat);
2465		}
2466		free(sc->tx.info, M_DEVBUF);
2467	}
2468	if (sc->rx_small.info != NULL) {
2469		if (sc->rx_small.dmat != NULL) {
2470			for (i = 0; i <= sc->rx_small.mask; i++) {
2471				bus_dmamap_destroy(sc->rx_small.dmat,
2472						   sc->rx_small.info[i].map);
2473			}
2474			bus_dmamap_destroy(sc->rx_small.dmat,
2475					   sc->rx_small.extra_map);
2476			bus_dma_tag_destroy(sc->rx_small.dmat);
2477		}
2478		free(sc->rx_small.info, M_DEVBUF);
2479	}
2480	if (sc->rx_big.info != NULL) {
2481		if (sc->rx_big.dmat != NULL) {
2482			for (i = 0; i <= sc->rx_big.mask; i++) {
2483				bus_dmamap_destroy(sc->rx_big.dmat,
2484						   sc->rx_big.info[i].map);
2485			}
2486			bus_dmamap_destroy(sc->rx_big.dmat,
2487					   sc->rx_big.extra_map);
2488			bus_dma_tag_destroy(sc->rx_big.dmat);
2489		}
2490		free(sc->rx_big.info, M_DEVBUF);
2491	}
2492}
2493
/*
 * Query the firmware for ring sizes, then allocate the interrupt
 * queue, tx request staging area, shadow rings, per-slot host info
 * arrays, busdma tags, and per-slot dmamaps.  On any failure all
 * partially-allocated resources are released via mxge_free_rings().
 * Returns 0 on success, errno otherwise.
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size, rx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int i, err;
	unsigned long bytes;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine ring sizes\n");
		goto abort_with_nothing;
	}

	rx_ring_size = cmd.data0;

	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_READY(&sc->ifp->if_snd);

	sc->tx.mask = tx_ring_entries - 1;
	sc->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
	/* interrupt queue is sized for both rx rings completing at once */
	sc->rx_done.mask = (2 * rx_ring_entries) - 1;

	/* default error for the allocation failures below */
	err = ENOMEM;

	/* allocate interrupt queues */
	bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
	if (err != 0)
		goto abort_with_nothing;
	sc->rx_done.entry = sc->rx_done.dma.addr;
	bzero(sc->rx_done.entry, bytes);

	/* allocate the tx request copy block */
	bytes = 8 +
		sizeof (*sc->tx.req_list) * (sc->tx.max_desc + 4);
	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.req_bytes == NULL)
		goto abort_with_alloc;
	/* ensure req_list entries are aligned to 8 bytes */
	sc->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*sc->tx.seg_list) * sc->tx.max_desc;
	sc->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.seg_list == NULL)
		goto abort_with_alloc;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.shadow == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.shadow == NULL)
		goto abort_with_alloc;

	/* allocate the host info rings */
	bytes = tx_ring_entries * sizeof (*sc->tx.info);
	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->tx.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.info == NULL)
		goto abort_with_alloc;

	/* allocate the busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx.boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 sc->tx.max_desc - 2,	/* num segs */
				 sc->tx.boundary,	/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 3*4096,		/* maxsize */
				 3,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* now use these tags to setup dmamaps for each slot
	   in each ring */
	for (i = 0; i <= sc->tx.mask; i++) {
		err = bus_dmamap_create(sc->tx.dmat, 0,
					&sc->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	for (i = 0; i <= sc->rx_small.mask; i++) {
		err = bus_dmamap_create(sc->rx_small.dmat, 0,
					&sc->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_small.dmat, 0,
				&sc->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
			goto abort_with_alloc;
	}

	for (i = 0; i <= sc->rx_big.mask; i++) {
		err = bus_dmamap_create(sc->rx_big.dmat, 0,
					&sc->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_big.dmat, 0,
				&sc->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
	}
	return 0;

abort_with_alloc:
	mxge_free_rings(sc);

abort_with_nothing:
	return err;
}
2687
2688static void
2689mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
2690{
2691	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
2692
2693	if (bufsize < MCLBYTES) {
2694		/* easy, everything fits in a single buffer */
2695		*big_buf_size = MCLBYTES;
2696		*cl_size = MCLBYTES;
2697		*nbufs = 1;
2698		return;
2699	}
2700
2701	if (bufsize < MJUMPAGESIZE) {
2702		/* still easy, everything still fits in a single buffer */
2703		*big_buf_size = MJUMPAGESIZE;
2704		*cl_size = MJUMPAGESIZE;
2705		*nbufs = 1;
2706		return;
2707	}
2708	/* now we need to use virtually contiguous buffers */
2709	*cl_size = MJUM9BYTES;
2710	*big_buf_size = 4096;
2711	*nbufs = mtu / 4096 + 1;
2712	/* needs to be a power of two, so round up */
2713	if (*nbufs == 3)
2714		*nbufs = 4;
2715}
2716
/*
 * Bring the interface up: allocate the LRO entry pool, reset the NIC,
 * negotiate buffer geometry with the firmware, locate the lanai rings,
 * stock the receive rings, program buffer sizes and the stats DMA
 * block, and finally start the firmware.  Returns 0 or errno.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err, big_bytes;
	bus_dmamap_t map;
	bus_addr_t bus;
	struct lro_entry *lro_entry;

	SLIST_INIT(&sc->lro_free);
	SLIST_INIT(&sc->lro_active);

	/* best-effort pool: on allocation failure just shrink lro_cnt */
	for (i = 0; i < sc->lro_cnt; i++) {
		lro_entry = (struct lro_entry *)
			malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (lro_entry == NULL) {
			sc->lro_cnt = i;
			break;
		}
		SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
	}

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
			   &sc->rx_big.cl_size, &sc->rx_big.nbufs);

	cmd.data0 = sc->rx_big.nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && sc->rx_big.nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set alway-use-n to %d\n",
			      sc->rx_big.nbufs);
		return EIO;
	}
	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	/* pre-poison big-ring addresses before real buffers are posted */
	for (i = 0; i <= sc->rx_big.mask; i++) {
		sc->rx_big.shadow[i].addr_low = 0xffffffff;
		sc->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	/* one big mbuf covers nbufs ring slots, hence the stride */
	for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	/* old firmware without STATS_DMA_V2: fall back to the obsolete
	   command, pointing only at the send_done_count field */
	if (err != 0) {
		bus = sc->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
2863
2864static int
2865mxge_close(mxge_softc_t *sc)
2866{
2867	struct lro_entry *lro_entry;
2868	mxge_cmd_t cmd;
2869	int err, old_down_cnt;
2870
2871	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2872	old_down_cnt = sc->down_cnt;
2873	mb();
2874	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2875	if (err) {
2876		device_printf(sc->dev, "Couldn't bring down link\n");
2877	}
2878	if (old_down_cnt == sc->down_cnt) {
2879		/* wait for down irq */
2880		DELAY(10 * sc->intr_coal_delay);
2881	}
2882	if (old_down_cnt == sc->down_cnt) {
2883		device_printf(sc->dev, "never got down irq\n");
2884	}
2885
2886	mxge_free_mbufs(sc);
2887
2888	while (!SLIST_EMPTY(&sc->lro_free)) {
2889		lro_entry = SLIST_FIRST(&sc->lro_free);
2890		SLIST_REMOVE_HEAD(&sc->lro_free, next);
2891	}
2892	return 0;
2893}
2894
/*
 * Program the device's PCI configuration space: record the negotiated
 * PCIe link width, raise the maximum read request size to 4KB, and
 * enable bus mastering and memory-space decoding.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* offset 0x12 into the PCIe capability: Link Status */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		/* offset 0x8: Device Control; (5 << 12) selects the
		   4096-byte max read request encoding */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
2918
/*
 * Read the NIC's reboot status register through the vendor-specific
 * PCI capability window.  Returns the status word, or (uint32_t)-1 if
 * the capability cannot be found.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	/* the selected register's value appears at vs + 0x14 */
	return (pci_read_config(dev, vs + 0x14, 4));
}
2937
/*
 * Handle a transmit watchdog timeout: determine whether the NIC
 * rebooted or simply hung, log diagnostics, and try to recover by
 * closing and re-opening the interface.  If recovery fails the
 * watchdog callout is stopped so the console is not spammed.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	int err;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			goto abort;
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		/* restore PCI configuration space */

		/* XXXX waiting for pci_cfg_restore() to be exported */
		goto abort; /* just abort for now */

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);
	} else {
		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
			      sc->tx.req, sc->tx.done);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      sc->tx.pkt_done,
			      be32toh(sc->fw_stats->send_done_count));
	}

	/* attempt recovery by cycling the interface */
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}

abort:
	/*
	 * stop the watchdog if the nic is dead, to avoid spamming the
	 * console
	 */
	if (err != 0) {
		callout_stop(&sc->co_hdl);
	}
}
3005
3006static void
3007mxge_watchdog(mxge_softc_t *sc)
3008{
3009	mxge_tx_buf_t *tx = &sc->tx;
3010
3011	/* see if we have outstanding transmits, which
3012	   have been pending for more than mxge_ticks */
3013	if (tx->req != tx->done &&
3014	    tx->watchdog_req != tx->watchdog_done &&
3015	    tx->done == tx->watchdog_done)
3016		mxge_watchdog_reset(sc);
3017
3018	tx->watchdog_req = tx->req;
3019	tx->watchdog_done = tx->done;
3020}
3021
/*
 * Periodic timer (every mxge_ticks): re-arms itself and runs the
 * transmit watchdog.  The callout was initialized with
 * callout_init_mtx() on sc->driver_mtx, so the handler is entered
 * with that mutex held.
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;


	/* Synchronize with possible callout reset/stop. */
	if (callout_pending(&sc->co_hdl) ||
	    !callout_active(&sc->co_hdl)) {
		/* NOTE(review): the callout was initialized without
		   CALLOUT_RETURNUNLOCKED, in which case the callout
		   code normally releases driver_mtx after the handler
		   returns -- verify this explicit unlock does not
		   double-release the mutex. */
		mtx_unlock(&sc->driver_mtx);
		return;
	}

	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	mxge_watchdog(sc);
}
3038
/*
 * Media changes are not supported on this hardware; reject any
 * attempt with EINVAL.
 */
static int
mxge_media_change(struct ifnet *ifp)
{

	return (EINVAL);
}
3044
3045static int
3046mxge_change_mtu(mxge_softc_t *sc, int mtu)
3047{
3048	struct ifnet *ifp = sc->ifp;
3049	int real_mtu, old_mtu;
3050	int err = 0;
3051
3052
3053	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3054	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3055		return EINVAL;
3056	mtx_lock(&sc->driver_mtx);
3057	old_mtu = ifp->if_mtu;
3058	ifp->if_mtu = mtu;
3059	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3060		callout_stop(&sc->co_hdl);
3061		mxge_close(sc);
3062		err = mxge_open(sc);
3063		if (err != 0) {
3064			ifp->if_mtu = old_mtu;
3065			mxge_close(sc);
3066			(void) mxge_open(sc);
3067		}
3068		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3069	}
3070	mtx_unlock(&sc->driver_mtx);
3071	return err;
3072}
3073
3074static void
3075mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3076{
3077	mxge_softc_t *sc = ifp->if_softc;
3078
3079
3080	if (sc == NULL)
3081		return;
3082	ifmr->ifm_status = IFM_AVALID;
3083	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
3084	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3085	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
3086}
3087
/*
 * Network interface ioctl handler.  Address and media requests are
 * delegated to the generic handlers; everything that touches driver
 * state is serialized on sc->driver_mtx.  Returns 0 or an errno.
 */
static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				/* bring the interface up and start
				   the watchdog timer */
				err = mxge_open(sc);
				callout_reset(&sc->co_hdl, mxge_ticks,
					      mxge_tick, sc);
			} else {
				/* take care of promisc and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				mxge_close(sc);
				callout_stop(&sc->co_hdl);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				/* TSO depends on tx checksums, so it
				   is disabled along with them */
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		} else if (mask & IFCAP_RXCSUM) {
			/* NOTE(review): because of the "else if", an
			   RXCSUM toggle is ignored when TXCSUM is
			   toggled in the same request -- confirm this
			   is intended. */
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask & IFCAP_TSO4) {
			if (IFCAP_TSO4 & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TSO4;
				ifp->if_hwassist &= ~CSUM_TSO;
			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable |= IFCAP_TSO4;
				ifp->if_hwassist |= CSUM_TSO;
			} else {
				/* TSO without tx checksum offload is
				   not supported by the firmware */
				printf("mxge requires tx checksum offload"
				       " be enabled to use TSO\n");
				err = EINVAL;
			}
		}
		if (mask & IFCAP_LRO) {
			if (IFCAP_LRO & ifp->if_capenable)
				err = mxge_change_lro_locked(sc, 0);
			else
				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
		}
		if (mask & IFCAP_VLAN_HWTAGGING)
			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
		mtx_unlock(&sc->driver_mtx);
		/* tell any vlans stacked on us about capability changes */
		VLAN_CAPABILITIES(ifp);

		break;

	case SIOCGIFMEDIA:
		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
				    &sc->media, command);
                break;

	default:
		err = ENOTTY;
        }
	return err;
}
3193
3194static void
3195mxge_fetch_tunables(mxge_softc_t *sc)
3196{
3197
3198	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3199			  &mxge_flow_control);
3200	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3201			  &mxge_intr_coal_delay);
3202	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3203			  &mxge_nvidia_ecrc_enable);
3204	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3205			  &mxge_force_firmware);
3206	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3207			  &mxge_deassert_wait);
3208	TUNABLE_INT_FETCH("hw.mxge.verbose",
3209			  &mxge_verbose);
3210	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3211	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3212	printf("%d %d\n", sc->lro_cnt, mxge_lro_cnt);
3213	if (sc->lro_cnt != 0)
3214		mxge_lro_cnt = sc->lro_cnt;
3215
3216	if (bootverbose)
3217		mxge_verbose = 1;
3218	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3219		mxge_intr_coal_delay = 30;
3220	if (mxge_ticks == 0)
3221		mxge_ticks = hz;
3222	sc->pause = mxge_flow_control;
3223
3224}
3225
3226static int
3227mxge_attach(device_t dev)
3228{
3229	mxge_softc_t *sc = device_get_softc(dev);
3230	struct ifnet *ifp;
3231	int count, rid, err;
3232
3233	sc->dev = dev;
3234	mxge_fetch_tunables(sc);
3235
3236	err = bus_dma_tag_create(NULL,			/* parent */
3237				 1,			/* alignment */
3238				 4096,			/* boundary */
3239				 BUS_SPACE_MAXADDR,	/* low */
3240				 BUS_SPACE_MAXADDR,	/* high */
3241				 NULL, NULL,		/* filter */
3242				 65536 + 256,		/* maxsize */
3243				 MXGE_MAX_SEND_DESC, 	/* num segs */
3244				 4096,			/* maxsegsize */
3245				 0,			/* flags */
3246				 NULL, NULL,		/* lock */
3247				 &sc->parent_dmat);	/* tag */
3248
3249	if (err != 0) {
3250		device_printf(sc->dev, "Err %d allocating parent dmat\n",
3251			      err);
3252		goto abort_with_nothing;
3253	}
3254
3255	ifp = sc->ifp = if_alloc(IFT_ETHER);
3256	if (ifp == NULL) {
3257		device_printf(dev, "can not if_alloc()\n");
3258		err = ENOSPC;
3259		goto abort_with_parent_dmat;
3260	}
3261	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
3262		 device_get_nameunit(dev));
3263	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
3264	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
3265		 device_get_nameunit(dev));
3266	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
3267	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
3268		 "%s:drv", device_get_nameunit(dev));
3269	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
3270		 MTX_NETWORK_LOCK, MTX_DEF);
3271
3272	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
3273
3274	mxge_setup_cfg_space(sc);
3275
3276	/* Map the board into the kernel */
3277	rid = PCIR_BARS;
3278	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
3279					 ~0, 1, RF_ACTIVE);
3280	if (sc->mem_res == NULL) {
3281		device_printf(dev, "could not map memory\n");
3282		err = ENXIO;
3283		goto abort_with_lock;
3284	}
3285	sc->sram = rman_get_virtual(sc->mem_res);
3286	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
3287	if (sc->sram_size > rman_get_size(sc->mem_res)) {
3288		device_printf(dev, "impossible memory region size %ld\n",
3289			      rman_get_size(sc->mem_res));
3290		err = ENXIO;
3291		goto abort_with_mem_res;
3292	}
3293
3294	/* make NULL terminated copy of the EEPROM strings section of
3295	   lanai SRAM */
3296	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
3297	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
3298				rman_get_bushandle(sc->mem_res),
3299				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
3300				sc->eeprom_strings,
3301				MXGE_EEPROM_STRINGS_SIZE - 2);
3302	err = mxge_parse_strings(sc);
3303	if (err != 0)
3304		goto abort_with_mem_res;
3305
3306	/* Enable write combining for efficient use of PCIe bus */
3307	mxge_enable_wc(sc);
3308
3309	/* Allocate the out of band dma memory */
3310	err = mxge_dma_alloc(sc, &sc->cmd_dma,
3311			     sizeof (mxge_cmd_t), 64);
3312	if (err != 0)
3313		goto abort_with_mem_res;
3314	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
3315	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
3316	if (err != 0)
3317		goto abort_with_cmd_dma;
3318
3319	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
3320			     sizeof (*sc->fw_stats), 64);
3321	if (err != 0)
3322		goto abort_with_zeropad_dma;
3323	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
3324
3325	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
3326	if (err != 0)
3327		goto abort_with_fw_stats;
3328
3329	/* Add our ithread  */
3330	count = pci_msi_count(dev);
3331	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
3332		rid = 1;
3333		sc->msi_enabled = 1;
3334	} else {
3335		rid = 0;
3336	}
3337	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3338					 1, RF_SHAREABLE | RF_ACTIVE);
3339	if (sc->irq_res == NULL) {
3340		device_printf(dev, "could not alloc interrupt\n");
3341		goto abort_with_dmabench;
3342	}
3343	if (mxge_verbose)
3344		device_printf(dev, "using %s irq %ld\n",
3345			      sc->msi_enabled ? "MSI" : "INTx",
3346			      rman_get_start(sc->irq_res));
3347	/* select & load the firmware */
3348	err = mxge_select_firmware(sc);
3349	if (err != 0)
3350		goto abort_with_irq_res;
3351	sc->intr_coal_delay = mxge_intr_coal_delay;
3352	err = mxge_reset(sc, 0);
3353	if (err != 0)
3354		goto abort_with_irq_res;
3355
3356	err = mxge_alloc_rings(sc);
3357	if (err != 0) {
3358		device_printf(sc->dev, "failed to allocate rings\n");
3359		goto abort_with_irq_res;
3360	}
3361
3362	err = bus_setup_intr(sc->dev, sc->irq_res,
3363			     INTR_TYPE_NET | INTR_MPSAFE,
3364			     NULL, mxge_intr, sc, &sc->ih);
3365	if (err != 0) {
3366		goto abort_with_rings;
3367	}
3368	/* hook into the network stack */
3369	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3370	ifp->if_baudrate = 100000000;
3371	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3372		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING |
3373		IFCAP_VLAN_HWCSUM | IFCAP_LRO;
3374
3375	sc->max_mtu = mxge_max_mtu(sc);
3376	if (sc->max_mtu >= 9000)
3377		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
3378	else
3379		device_printf(dev, "MTU limited to %d.  Install "
3380			      "latest firmware for 9000 byte jumbo support\n",
3381			      sc->max_mtu - ETHER_HDR_LEN);
3382	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3383	ifp->if_capenable = ifp->if_capabilities;
3384	if (sc->lro_cnt == 0)
3385		ifp->if_capenable &= ~IFCAP_LRO;
3386	sc->csum_flag = 1;
3387        ifp->if_init = mxge_init;
3388        ifp->if_softc = sc;
3389        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3390        ifp->if_ioctl = mxge_ioctl;
3391        ifp->if_start = mxge_start;
3392	ether_ifattach(ifp, sc->mac_addr);
3393	/* ether_ifattach sets mtu to 1500 */
3394	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
3395		ifp->if_mtu = 9000;
3396
3397	/* Initialise the ifmedia structure */
3398	ifmedia_init(&sc->media, 0, mxge_media_change,
3399		     mxge_media_status);
3400	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3401	mxge_add_sysctls(sc);
3402	return 0;
3403
3404abort_with_rings:
3405	mxge_free_rings(sc);
3406abort_with_irq_res:
3407	bus_release_resource(dev, SYS_RES_IRQ,
3408			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3409	if (sc->msi_enabled)
3410		pci_release_msi(dev);
3411abort_with_dmabench:
3412	mxge_dma_free(&sc->dmabench_dma);
3413abort_with_fw_stats:
3414	mxge_dma_free(&sc->fw_stats_dma);
3415abort_with_zeropad_dma:
3416	mxge_dma_free(&sc->zeropad_dma);
3417abort_with_cmd_dma:
3418	mxge_dma_free(&sc->cmd_dma);
3419abort_with_mem_res:
3420	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3421abort_with_lock:
3422	pci_disable_busmaster(dev);
3423	mtx_destroy(&sc->cmd_mtx);
3424	mtx_destroy(&sc->tx_mtx);
3425	mtx_destroy(&sc->driver_mtx);
3426	if_free(ifp);
3427abort_with_parent_dmat:
3428	bus_dma_tag_destroy(sc->parent_dmat);
3429
3430abort_with_nothing:
3431	return err;
3432}
3433
/*
 * Device detach: undo everything mxge_attach() set up, in reverse
 * order of acquisition.  Refuses to detach while vlan interfaces are
 * still stacked on top of us.  Returns 0 or EBUSY.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	if (sc->ifp->if_vlantrunk != NULL) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	/* bring the interface down and stop the watchdog under the
	   driver lock */
	mtx_lock(&sc->driver_mtx);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	callout_stop(&sc->co_hdl);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	ifmedia_removeall(&sc->media);
	/* tell the NIC to stop its dummy rdma traffic */
	mxge_dummy_rdma(sc, 0);
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	mxge_free_rings(sc);
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);

	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
3473
3474static int
3475mxge_shutdown(device_t dev)
3476{
3477	return 0;
3478}
3479
3480/*
3481  This file uses Myri10GE driver indentation.
3482
3483  Local Variables:
3484  c-file-style:"linux"
3485  tab-width:8
3486  End:
3487*/
3488