if_mxge.c revision 168298
1/******************************************************************************
2
3Copyright (c) 2006, Myricom Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Redistributions in binary form must reproduce the above copyright
13    notice, this list of conditions and the following disclaimer in the
14    documentation and/or other materials provided with the distribution.
15
16 3. Neither the name of the Myricom Inc, nor the names of its
17    contributors may be used to endorse or promote products derived from
18    this software without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30POSSIBILITY OF SUCH DAMAGE.
31
32***************************************************************************/
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 168298 2007-04-03 10:41:33Z gallatin $");
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/linker.h>
40#include <sys/firmware.h>
41#include <sys/endian.h>
42#include <sys/sockio.h>
43#include <sys/mbuf.h>
44#include <sys/malloc.h>
45#include <sys/kdb.h>
46#include <sys/kernel.h>
47#include <sys/lock.h>
48#include <sys/module.h>
49#include <sys/memrange.h>
50#include <sys/socket.h>
51#include <sys/sysctl.h>
52#include <sys/sx.h>
53
54#include <net/if.h>
55#include <net/if_arp.h>
56#include <net/ethernet.h>
57#include <net/if_dl.h>
58#include <net/if_media.h>
59
60#include <net/bpf.h>
61
62#include <net/if_types.h>
63#include <net/if_vlan_var.h>
64#include <net/zlib.h>
65
66#include <netinet/in_systm.h>
67#include <netinet/in.h>
68#include <netinet/ip.h>
69#include <netinet/tcp.h>
70
71#include <machine/bus.h>
72#include <machine/resource.h>
73#include <sys/bus.h>
74#include <sys/rman.h>
75
76#include <dev/pci/pcireg.h>
77#include <dev/pci/pcivar.h>
78
79#include <vm/vm.h>		/* for pmap_mapdev() */
80#include <vm/pmap.h>
81
82#include <dev/mxge/mxge_mcp.h>
83#include <dev/mxge/mcp_gen_header.h>
84#include <dev/mxge/if_mxge_var.h>
85
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on an upstream
					   Nvidia bridge (see
					   mxge_select_firmware) */
static int mxge_force_firmware = 0;	/* 0 = autodetect; 1 = force aligned
					   firmware; other non-zero = force
					   unaligned firmware */
static int mxge_max_intr_slots = 1024;	/* number of rx completion ring
					   entries exchanged with the NIC */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay written
					   to the NIC via sysctl */
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;		/* non-zero: extra diagnostics */
static int mxge_ticks;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";	/* fw image for unaligned
							   PCI-E completions */
static char *mxge_fw_aligned = "mxge_eth_z8e";		/* fw image for aligned
							   PCI-E completions */
97
/* newbus device interface entry points */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware(9) is needed to load the eth_z8e/ethp_z8e images */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
126
127static int
128mxge_probe(device_t dev)
129{
130  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
131      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
132	  device_set_desc(dev, "Myri10G-PCIE-8A");
133	  return 0;
134  }
135  return ENXIO;
136}
137
138static void
139mxge_enable_wc(mxge_softc_t *sc)
140{
141	struct mem_range_desc mrdesc;
142	vm_paddr_t pa;
143	vm_offset_t len;
144	int err, action;
145
146	pa = rman_get_start(sc->mem_res);
147	len = rman_get_size(sc->mem_res);
148	mrdesc.mr_base = pa;
149	mrdesc.mr_len = len;
150	mrdesc.mr_flags = MDF_WRITECOMBINE;
151	action = MEMRANGE_SET_UPDATE;
152	strcpy((char *)&mrdesc.mr_owner, "mxge");
153	err = mem_range_attr_set(&mrdesc, &action);
154	if (err != 0) {
155		device_printf(sc->dev,
156			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
157			      (unsigned long)pa, (unsigned long)len, err);
158	} else {
159		sc->wc = 1;
160	}
161}
162
163
164/* callback to get our DMA address */
165static void
166mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
167			 int error)
168{
169	if (error == 0) {
170		*(bus_addr_t *) arg = segs->ds_addr;
171	}
172}
173
/*
 * Allocate a single coherent DMA buffer of "bytes" bytes with the
 * given alignment.  On success the tag, map, kernel virtual address
 * and bus address are stored in "dma"; on failure everything acquired
 * so far is released (goto-cleanup chain) and an errno is returned.
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; the callback records dma->bus_addr */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
224
225
/* Release everything mxge_dma_alloc() acquired, in reverse order. */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
233
234/*
235 * The eeprom strings on the lanaiX have the format
236 * SN=x\0
237 * MAC=x:x:x:x:x:x\0
238 * PC=text\0
239 */
240
/*
 * Walk the NUL-separated eeprom strings, extracting the MAC address
 * ("MAC=xx:xx:xx:xx:xx:xx"), product code ("PC=...") and serial
 * number ("SN=...").  Returns 0 if a MAC address was found, ENXIO
 * otherwise.
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
/* advance ptr just past the NUL terminating the current string */
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* NOT a bug: advancing by 1 here, then by 3
			   at the top of each loop pass, puts ptr at
			   offset 4 (the first hex digit just past
			   "MAC=") on the first pass and steps over
			   each "xx:" group thereafter */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			/* NOTE(review): relies on the destination
			   already being zeroed for NUL termination
			   when the source is over-long */
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
283
#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (nForce4) bridge by
 * setting bit 0x40 of extended config register 0x178.  Since this
 * kernel lacks extended (>0xff) PCI config access, the bridge's
 * config space is located by its well-known ECAM-style physical
 * address and mapped directly with pmap_mapdev().
 * Returns 0 on success, EIO if the mapping could not be validated.
 */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	uint32_t val;
	unsigned long off;
	char *va, *cfgptr;
	uint16_t vendor_id, device_id;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return 0;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* fetch the bridge's bus/slot/function and IDs from the bus */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of this function's config space window */
	off =  0xe0000000UL
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return EIO;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it by checking the
	   IDs read through the mapping against those from the bus */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}
	/* set the ECRC-generation enable bit */
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return 0;
}
#else
/* non-x86 stub: the nForce4 config-space hack only makes sense on
   i386/amd64 hardware */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return ENXIO;
}
#endif
383/*
384 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
385 * when the PCI-E Completion packets are aligned on an 8-byte
386 * boundary.  Some PCI-E chip sets always align Completion packets; on
387 * the ones that do not, the alignment can be enforced by enabling
388 * ECRC generation (if supported).
389 *
390 * When PCI-E Completion packets are not aligned, it is actually more
391 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
392 *
393 * If the driver can neither enable ECRC nor verify that it has
394 * already been enabled, then it must use a firmware image which works
395 * around unaligned completion packets (ethp_z8e.dat), and it should
396 * also ensure that it never gives the device a Read-DMA which is
397 * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
398 * enabled, then the driver should use the aligned (eth_z8e.dat)
399 * firmware image, and set tx.boundary to 4KB.
400 */
401
/*
 * Decide between the aligned and unaligned firmware image (see the
 * block comment above) and set sc->fw_name and sc->tx.boundary
 * accordingly: aligned -> eth_z8e / 4KB, unaligned -> ethp_z8e / 2KB.
 */
static void
mxge_select_firmware(mxge_softc_t *sc)
{
	int err, aligned = 0;
	device_t pdev;
	uint16_t pvend, pdid;


	/* honor the administrator's override first */
	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	/* otherwise inspect the upstream PCI-E bridge */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		goto abort;
	}
	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* see if we can enable ECRC's on an upstream
	   Nvidia bridge */
	if (mxge_nvidia_ecrc_enable &&
	    (pvend == 0x10de && pdid == 0x005d)) {
		err = mxge_enable_nvidia_ecrc(sc, pdev);
		if (err == 0) {
			aligned = 1;
			if (mxge_verbose)
				device_printf(sc->dev,
					      "Assuming aligned completions"
					      " (ECRC)\n");
		}
	}
	/* see if the upstream bridge is known to
	   provided aligned completions */
	if (/* HT2000 */ (pvend == 0x1166 && pdid == 0x0132) ||
	    /* PLX */    (pvend == 0x10b5 && pdid == 0x8532) ||
	    /* Intel */  (pvend == 0x8086 &&
	      /* E5000 NorthBridge*/((pdid >= 0x25f7 && pdid <= 0x25fa) ||
	      /* E5000 SouthBridge*/ (pdid >= 0x3510 && pdid <= 0x351b)))) {
		aligned = 1;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming aligned completions "
				      "(0x%x:0x%x)\n", pvend, pdid);
	}

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx.boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
	}
}
476
/* Union used to strip the const qualifier from the firmware(9)
   image pointer without a cast, so the read-only image can be passed
   to mxge_pio_copy() in mxge_load_firmware_helper(). */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
482
483static int
484mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
485{
486
487
488	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
489		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
490			      be32toh(hdr->mcp_type));
491		return EIO;
492	}
493
494	/* save firmware version for sysctl */
495	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
496	if (mxge_verbose)
497		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
498
499	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
500	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
501
502	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
503	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
504		device_printf(sc->dev, "Found firmware version %s\n",
505			      sc->fw_version);
506		device_printf(sc->dev, "Driver needs %d.%d\n",
507			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
508		return EINVAL;
509	}
510	return 0;
511
512}
513
/*
 * Fetch the firmware image named by sc->fw_name via firmware(9),
 * validate it, and copy it into NIC SRAM at MXGE_FW_OFFSET.
 * On entry *limit holds the maximum image size (SRAM size); on
 * success it is updated to the actual image size.  Returns 0 or an
 * errno; the firmware reference is always released before return.
 */
static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	/* image must fit in SRAM and be big enough to hold the
	   header-pointer word */
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	fw_data = (const char *)fw->data;
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	/* use the union to shed const; mxge_pio_copy takes char * */
	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		/* read back from SRAM between chunks — NOTE(review):
		   presumably to flush the posted PIO writes; confirm */
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
574
575/*
576 * Enable or disable periodic RDMAs from the host to make certain
577 * chipsets resend dropped PCIe messages
578 */
579
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align the command buffer to an 8-byte boundary within
	   buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll the confirmation word for up to ~21ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
631
632static int
633mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
634{
635	mcp_cmd_t *buf;
636	char buf_bytes[sizeof(*buf) + 8];
637	volatile mcp_cmd_response_t *response = sc->cmd;
638	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
639	uint32_t dma_low, dma_high;
640	int sleep_total = 0;
641
642	/* ensure buf is aligned to 8 bytes */
643	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
644
645	buf->data0 = htobe32(data->data0);
646	buf->data1 = htobe32(data->data1);
647	buf->data2 = htobe32(data->data2);
648	buf->cmd = htobe32(cmd);
649	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
650	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
651
652	buf->response_addr.low = htobe32(dma_low);
653	buf->response_addr.high = htobe32(dma_high);
654	mtx_lock(&sc->cmd_mtx);
655	response->result = 0xffffffff;
656	mb();
657	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
658
659	/* wait up to 20ms */
660	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
661		bus_dmamap_sync(sc->cmd_dma.dmat,
662				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
663		mb();
664		if (response->result != 0xffffffff) {
665			if (response->result == 0) {
666				data->data0 = be32toh(response->data);
667				mtx_unlock(&sc->cmd_mtx);
668				return 0;
669			} else {
670				device_printf(sc->dev,
671					      "mxge: command %d "
672					      "failed, result = %d\n",
673					      cmd, be32toh(response->result));
674				mtx_unlock(&sc->cmd_mtx);
675				return ENXIO;
676			}
677		}
678		DELAY(1000);
679	}
680	mtx_unlock(&sc->cmd_mtx);
681	device_printf(sc->dev, "mxge: command %d timed out"
682		      "result = %d\n",
683		      cmd, be32toh(response->result));
684	return EAGAIN;
685}
686
/*
 * Validate the firmware already running on the NIC (used when no
 * loadable image is available).  Copies the running image's header
 * out of SRAM into host memory, validates it, and flags the known
 * 1.4.4–1.4.11 rx-filter bug so the NIC can be kept in ALLMULTI.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	/* header must be 4-byte aligned and lie inside SRAM */
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
735
736
/*
 * Load (or adopt) firmware and hand control off to it.  First tries
 * to copy a fresh image into SRAM via mxge_load_firmware_helper();
 * if that fails, falls back to validating whatever firmware is
 * already running (forcing the 2KB unaligned tx boundary).  After a
 * fresh load, a handoff command is submitted to the bootstrap MCP
 * and its confirmation word is polled.  Returns 0 or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align the handoff command buffer to 8 bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		/* adopted firmware alignment is unknown; be safe and
		   use the unaligned settings */
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll the confirmation word for up to ~200ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
819
820static int
821mxge_update_mac_address(mxge_softc_t *sc)
822{
823	mxge_cmd_t cmd;
824	uint8_t *addr = sc->mac_addr;
825	int status;
826
827
828	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
829		     | (addr[2] << 8) | addr[3]);
830
831	cmd.data1 = ((addr[4] << 8) | (addr[5]));
832
833	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
834	return status;
835}
836
837static int
838mxge_change_pause(mxge_softc_t *sc, int pause)
839{
840	mxge_cmd_t cmd;
841	int status;
842
843	if (pause)
844		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
845				       &cmd);
846	else
847		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
848				       &cmd);
849
850	if (status) {
851		device_printf(sc->dev, "Failed to set flow control mode\n");
852		return ENXIO;
853	}
854	sc->pause = pause;
855	return 0;
856}
857
858static void
859mxge_change_promisc(mxge_softc_t *sc, int promisc)
860{
861	mxge_cmd_t cmd;
862	int status;
863
864	if (promisc)
865		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
866				       &cmd);
867	else
868		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
869				       &cmd);
870
871	if (status) {
872		device_printf(sc->dev, "Failed to set promisc mode\n");
873	}
874}
875
/*
 * Push the interface's multicast filter list to the firmware.
 * The NIC is put in ALLMULTI while the list is rebuilt; if anything
 * fails (or IFF_ALLMULTI / the adopted-firmware rx-filter bug is in
 * effect) it is simply left in ALLMULTI mode.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	/* adopted 1.4.4-1.4.11 firmware must stay in ALLMULTI (see
	   mxge_adopt_running_firmware) */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte link-level address across the two
		   command words, network byte order */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
943
944
945static int
946mxge_reset(mxge_softc_t *sc)
947{
948
949	mxge_cmd_t cmd;
950	size_t bytes;
951	int status;
952
953	/* try to send a reset command to the card to see if it
954	   is alive */
955	memset(&cmd, 0, sizeof (cmd));
956	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
957	if (status != 0) {
958		device_printf(sc->dev, "failed reset\n");
959		return ENXIO;
960	}
961
962	mxge_dummy_rdma(sc, 1);
963
964	/* Now exchange information about interrupts  */
965	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);\
966	memset(sc->rx_done.entry, 0, bytes);
967	cmd.data0 = (uint32_t)bytes;
968	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
969	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
970	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
971	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
972
973	status |= mxge_send_cmd(sc,
974				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
975
976
977	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
978
979	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
980	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
981
982
983	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
984				&cmd);
985	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
986	if (status != 0) {
987		device_printf(sc->dev, "failed set interrupt parameters\n");
988		return status;
989	}
990
991
992	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
993
994
995	/* run a DMA benchmark */
996	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
997
998	/* Read DMA */
999	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1000	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1001	cmd.data2 = sc->tx.boundary * 0x10000;
1002
1003	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1004	if (status != 0)
1005		device_printf(sc->dev, "read dma benchmark failed\n");
1006	else
1007		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1008			(cmd.data0 & 0xffff);
1009
1010	/* Write DMA */
1011	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1012	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1013	cmd.data2 = sc->tx.boundary * 0x1;
1014	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1015	if (status != 0)
1016		device_printf(sc->dev, "write dma benchmark failed\n");
1017	else
1018		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1019			(cmd.data0 & 0xffff);
1020	/* Read/Write DMA */
1021	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1022	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1023	cmd.data2 = sc->tx.boundary * 0x10001;
1024	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1025	if (status != 0)
1026		device_printf(sc->dev, "read/write dma benchmark failed\n");
1027	else
1028		sc->read_write_dma =
1029			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
1030			(cmd.data0 & 0xffff);
1031
1032	/* reset mcp/driver shared state back to 0 */
1033	bzero(sc->rx_done.entry, bytes);
1034	sc->rx_done.idx = 0;
1035	sc->rx_done.cnt = 0;
1036	sc->tx.req = 0;
1037	sc->tx.done = 0;
1038	sc->tx.pkt_done = 0;
1039	sc->tx.wake = 0;
1040	sc->tx.stall = 0;
1041	sc->rx_big.cnt = 0;
1042	sc->rx_small.cnt = 0;
1043	sc->rdma_tags_available = 15;
1044	sc->fw_stats->valid = 0;
1045	sc->fw_stats->send_done_count = 0;
1046	status = mxge_update_mac_address(sc);
1047	mxge_change_promisc(sc, 0);
1048	mxge_change_pause(sc, sc->pause);
1049	mxge_set_multicast_list(sc);
1050	return status;
1051}
1052
/*
 * Sysctl handler for the interrupt coalescing delay: validates the
 * new value (must be 1..1000000) and writes it to the NIC register
 * pointed at by sc->intr_coal_delay_ptr under the driver lock.
 */
static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}
1079
1080static int
1081mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1082{
1083        mxge_softc_t *sc;
1084        unsigned int enabled;
1085        int err;
1086
1087        sc = arg1;
1088        enabled = sc->pause;
1089        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1090        if (err != 0) {
1091                return err;
1092        }
1093        if (enabled == sc->pause)
1094                return 0;
1095
1096	mtx_lock(&sc->driver_mtx);
1097	err = mxge_change_pause(sc, enabled);
1098	mtx_unlock(&sc->driver_mtx);
1099        return err;
1100}
1101
1102static int
1103mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1104{
1105        int err;
1106
1107        if (arg1 == NULL)
1108                return EFAULT;
1109        arg2 = be32toh(*(int *)arg1);
1110        arg1 = NULL;
1111        err = sysctl_handle_int(oidp, arg1, arg2, req);
1112
1113        return err;
1114}
1115
1116static void
1117mxge_add_sysctls(mxge_softc_t *sc)
1118{
1119	struct sysctl_ctx_list *ctx;
1120	struct sysctl_oid_list *children;
1121	mcp_irq_data_t *fw;
1122
1123	ctx = device_get_sysctl_ctx(sc->dev);
1124	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1125	fw = sc->fw_stats;
1126
1127	/* random information */
1128	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1129		       "firmware_version",
1130		       CTLFLAG_RD, &sc->fw_version,
1131		       0, "firmware version");
1132	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1133		       "serial_number",
1134		       CTLFLAG_RD, &sc->serial_number_string,
1135		       0, "serial number");
1136	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1137		       "product_code",
1138		       CTLFLAG_RD, &sc->product_code_string,
1139		       0, "product_code");
1140	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1141		       "pcie_link_width",
1142		       CTLFLAG_RD, &sc->link_width,
1143		       0, "tx_boundary");
1144	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1145		       "tx_boundary",
1146		       CTLFLAG_RD, &sc->tx.boundary,
1147		       0, "tx_boundary");
1148	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1149		       "write_combine",
1150		       CTLFLAG_RD, &sc->wc,
1151		       0, "write combining PIO?");
1152	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1153		       "read_dma_MBs",
1154		       CTLFLAG_RD, &sc->read_dma,
1155		       0, "DMA Read speed in MB/s");
1156	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1157		       "write_dma_MBs",
1158		       CTLFLAG_RD, &sc->write_dma,
1159		       0, "DMA Write speed in MB/s");
1160	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1161		       "read_write_dma_MBs",
1162		       CTLFLAG_RD, &sc->read_write_dma,
1163		       0, "DMA concurrent Read/Write speed in MB/s");
1164
1165
1166	/* performance related tunables */
1167	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1168			"intr_coal_delay",
1169			CTLTYPE_INT|CTLFLAG_RW, sc,
1170			0, mxge_change_intr_coal,
1171			"I", "interrupt coalescing delay in usecs");
1172
1173	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1174			"flow_control_enabled",
1175			CTLTYPE_INT|CTLFLAG_RW, sc,
1176			0, mxge_change_flow_control,
1177			"I", "interrupt coalescing delay in usecs");
1178
1179	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1180		       "deassert_wait",
1181		       CTLFLAG_RW, &mxge_deassert_wait,
1182		       0, "Wait for IRQ line to go low in ihandler");
1183
1184	/* stats block from firmware is in network byte order.
1185	   Need to swap it */
1186	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1187			"link_up",
1188			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1189			0, mxge_handle_be32,
1190			"I", "link up");
1191	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1192			"rdma_tags_available",
1193			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1194			0, mxge_handle_be32,
1195			"I", "rdma_tags_available");
1196	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1197			"dropped_link_overflow",
1198			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1199			0, mxge_handle_be32,
1200			"I", "dropped_link_overflow");
1201	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1202			"dropped_link_error_or_filtered",
1203			CTLTYPE_INT|CTLFLAG_RD,
1204			&fw->dropped_link_error_or_filtered,
1205			0, mxge_handle_be32,
1206			"I", "dropped_link_error_or_filtered");
1207	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1208			"dropped_multicast_filtered",
1209			CTLTYPE_INT|CTLFLAG_RD,
1210			&fw->dropped_multicast_filtered,
1211			0, mxge_handle_be32,
1212			"I", "dropped_multicast_filtered");
1213	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1214			"dropped_runt",
1215			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1216			0, mxge_handle_be32,
1217			"I", "dropped_runt");
1218	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1219			"dropped_overrun",
1220			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1221			0, mxge_handle_be32,
1222			"I", "dropped_overrun");
1223	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1224			"dropped_no_small_buffer",
1225			CTLTYPE_INT|CTLFLAG_RD,
1226			&fw->dropped_no_small_buffer,
1227			0, mxge_handle_be32,
1228			"I", "dropped_no_small_buffer");
1229	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1230			"dropped_no_big_buffer",
1231			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1232			0, mxge_handle_be32,
1233			"I", "dropped_no_big_buffer");
1234
1235	/* host counters exported for debugging */
1236	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1237		       "rx_small_cnt",
1238		       CTLFLAG_RD, &sc->rx_small.cnt,
1239		       0, "rx_small_cnt");
1240	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1241		       "rx_big_cnt",
1242		       CTLFLAG_RD, &sc->rx_big.cnt,
1243		       0, "rx_small_cnt");
1244	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1245		       "tx_req",
1246		       CTLFLAG_RD, &sc->tx.req,
1247		       0, "tx_req");
1248	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1249		       "tx_done",
1250		       CTLFLAG_RD, &sc->tx.done,
1251		       0, "tx_done");
1252	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1253		       "tx_pkt_done",
1254		       CTLFLAG_RD, &sc->tx.pkt_done,
1255		       0, "tx_done");
1256	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1257		       "tx_stall",
1258		       CTLFLAG_RD, &sc->tx.stall,
1259		       0, "tx_stall");
1260	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1261		       "tx_wake",
1262		       CTLFLAG_RD, &sc->tx.wake,
1263		       0, "tx_wake");
1264
1265	/* verbose printing? */
1266	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1267		       "verbose",
1268		       CTLFLAG_RW, &mxge_verbose,
1269		       0, "verbose printing");
1270
1271}
1272
1273/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1274   backwards one at a time and handle ring wraps */
1275
static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;

        /* Copy all but the first request to the NIC, highest slot
	 * first, masking each index so ring wraps are handled; the
	 * caller writes the first slot afterwards to validate the
	 * whole chain. */
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb();
        }
}
1290
1291/*
1292 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1293 * at most 32 bytes at a time, so as to avoid involving the software
1294 * pio handler in the nic.   We re-write the first segment's flags
1295 * to mark them valid only after writing the entire chain
1296 */
1297
static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* Clear the first request's flags so the NIC ignores the
	 * chain until the flags are re-written at the end. */
	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy pairwise (32 bytes per burst) */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}
1346
/* Submit send requests through the NIC's write-combining FIFO,
   64 bytes (4 requests) at a time. */
static inline void
mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
    tx->req += cnt;
    mb();
    while (cnt >= 4) {
	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
	    mb();
	    src += 4;
	    cnt -= 4;
    }
    if (cnt > 0) {
	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
	       needs to be so that we don't overrun it */
	    mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
	    mb();
    }
}
1365
/*
 * Build and submit the send-descriptor chain for a TSO packet whose
 * busdma segments are already in sc->tx.seg_list.  Splits each DMA
 * segment at MSS boundaries, re-marking MXGEFW_FLAGS_FIRST at every
 * segment cut, and back-patches rdma_count after each cut since the
 * count is not known in advance.  Drops the packet if it would need
 * more than MXGE_MAX_SEND_DESC descriptors.
 */
static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < sizeof (*eh)
			    + sizeof (*ip))) {
		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
			   sc->scratch);
		eh = (struct ether_header *)sc->scratch;
	} else {
		eh = mtod(m, struct ether_header *);
	}
	ip = (struct ip *) (eh + 1);
	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		eh = (struct ether_header *) sc->scratch;
		ip = (struct ip *) (eh + 1);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	/* cum_len starts at minus the total header length; it turns
	   non-negative once the send loop reaches payload bytes */
	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* back-patch the RDMA count of the request that
			   started the current run */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* branch-free: reset rdma_count to -1 on a
				   cut or segment start, bump it on a chop
				   mid-segment */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* cksum_offset only applies while we are still
			   inside the headers */
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* final back-patch for the trailing run of RDMAs */
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards marking TSO_LAST until the start of the
	   final TCP segment */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	sc->ifp->if_oerrors++;
	if (!once) {
		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
		once = 1;
	}
	return;

}
1529
/*
 * Map a frame for DMA and hand it to the NIC as a chain of send
 * descriptors.  Falls back to m_defrag() if the mbuf chain has too
 * many segments, diverts TSO frames to mxge_encap_tso(), sets up
 * checksum-offload fields when requested, and pads runts to the
 * 60-byte ethernet minimum using the shared zero page.
 */
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;


	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < sizeof (*eh)
				    + sizeof (*ip))) {
			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
				   sc->scratch);
			eh = (struct ether_header *)sc->scratch;
		} else {
			eh = mtod(m, struct ether_header *);
		}
		ip = (struct ip *) (eh + 1);
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* the checksum offset is relative to the current
		   request; consume it as segments are emitted */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		/* append a descriptor pointing at the shared zero
		   page to bring the frame up to minimum length */
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* mark the slot holding the last descriptor of this packet */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1681
1682
1683
1684
1685static inline void
1686mxge_start_locked(mxge_softc_t *sc)
1687{
1688	struct mbuf *m;
1689	struct ifnet *ifp;
1690
1691	ifp = sc->ifp;
1692	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1693	       > MXGE_MAX_SEND_DESC) {
1694
1695		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1696		if (m == NULL) {
1697			return;
1698		}
1699		/* let BPF see it */
1700		BPF_MTAP(ifp, m);
1701
1702		/* give it to the nic */
1703		mxge_encap(sc, m);
1704	}
1705	/* ran out of transmit slots */
1706	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1707		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1708		sc->tx.stall++;
1709	}
1710}
1711
1712static void
1713mxge_start(struct ifnet *ifp)
1714{
1715	mxge_softc_t *sc = ifp->if_softc;
1716
1717
1718	mtx_lock(&sc->tx_mtx);
1719	mxge_start_locked(sc);
1720	mtx_unlock(&sc->tx_mtx);
1721}
1722
1723/*
1724 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1725 * at most 32 bytes at a time, so as to avoid involving the software
1726 * pio handler in the nic.   We re-write the first segment's low
1727 * DMA address to mark it valid only after we write the entire chunk
1728 * in a burst
1729 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* poison the first descriptor's low address so the NIC
	   ignores the chunk until the rewrite below validates it */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* two 32-byte bursts keep each PIO write under the limit */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the real address, marking all 8 descriptors valid */
	src->addr_low = low;
	dst->addr_low = low;
	mb();
}
1746
1747static int
1748mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1749{
1750	bus_dma_segment_t seg;
1751	struct mbuf *m;
1752	mxge_rx_buf_t *rx = &sc->rx_small;
1753	int cnt, err;
1754
1755	m = m_gethdr(M_DONTWAIT, MT_DATA);
1756	if (m == NULL) {
1757		rx->alloc_fail++;
1758		err = ENOBUFS;
1759		goto done;
1760	}
1761	m->m_len = MHLEN;
1762	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1763				      &seg, &cnt, BUS_DMA_NOWAIT);
1764	if (err != 0) {
1765		m_free(m);
1766		goto done;
1767	}
1768	rx->info[idx].m = m;
1769	rx->shadow[idx].addr_low =
1770		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1771	rx->shadow[idx].addr_high =
1772		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1773
1774done:
1775	if ((idx & 7) == 7) {
1776		if (rx->wc_fifo == NULL)
1777			mxge_submit_8rx(&rx->lanai[idx - 7],
1778					&rx->shadow[idx - 7]);
1779		else {
1780			mb();
1781			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1782		}
1783        }
1784	return err;
1785}
1786
1787static int
1788mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1789{
1790	bus_dma_segment_t seg;
1791	struct mbuf *m;
1792	mxge_rx_buf_t *rx = &sc->rx_big;
1793	int cnt, err;
1794
1795	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
1796	if (m == NULL) {
1797		rx->alloc_fail++;
1798		err = ENOBUFS;
1799		goto done;
1800	}
1801	m->m_len = sc->big_bytes;
1802	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1803				      &seg, &cnt, BUS_DMA_NOWAIT);
1804	if (err != 0) {
1805		m_free(m);
1806		goto done;
1807	}
1808	rx->info[idx].m = m;
1809	rx->shadow[idx].addr_low =
1810		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1811	rx->shadow[idx].addr_high =
1812		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1813
1814done:
1815	if ((idx & 7) == 7) {
1816		if (rx->wc_fifo == NULL)
1817			mxge_submit_8rx(&rx->lanai[idx - 7],
1818					&rx->shadow[idx - 7]);
1819		else {
1820			mb();
1821			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1822		}
1823        }
1824	return err;
1825}
1826
1827static inline void
1828mxge_rx_csum(struct mbuf *m, int csum)
1829{
1830	struct ether_header *eh;
1831	struct ip *ip;
1832
1833	eh = mtod(m, struct ether_header *);
1834	if (__predict_true(eh->ether_type ==  htons(ETHERTYPE_IP))) {
1835		ip = (struct ip *)(eh + 1);
1836		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
1837				   ip->ip_p == IPPROTO_UDP)) {
1838			m->m_pkthdr.csum_data = csum;
1839			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1840		}
1841	}
1842}
1843
/*
 * Receive a frame that spans one or more big-ring buffers: chain the
 * mbufs together, replace each consumed ring slot, and pass the chain
 * up the stack.  If any replacement allocation fails the whole frame
 * is dropped and the remaining slots are recycled through the
 * allocator so the ring stays full.
 */
static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
{
	struct ifnet *ifp;
	struct mbuf *m = 0; 		/* -Wunitialized */
	struct mbuf *m_prev = 0;	/* -Wunitialized */
	struct mbuf *m_head = 0;
	bus_dmamap_t old_map;
	mxge_rx_buf_t *rx;
	int idx;


	rx = &sc->rx_big;
	ifp = sc->ifp;
	while (len > 0) {
		idx = rx->cnt & rx->mask;
                rx->cnt++;
		/* save a pointer to the received mbuf */
		m = rx->info[idx].m;
		/* try to replace the received mbuf */
		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
			goto drop;
		}
		/* unmap the received buffer */
		old_map = rx->info[idx].map;
		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dmat, old_map);

		/* swap the bus_dmamap_t's */
		rx->info[idx].map = rx->extra_map;
		rx->extra_map = old_map;

		/* chain multiple segments together */
		if (!m_head) {
			m_head = m;
			/* mcp implicitly skips 1st bytes so that
			 * packet is properly aligned */
			m->m_data += MXGEFW_PAD;
			m->m_pkthdr.len = len;
			m->m_len = sc->big_bytes - MXGEFW_PAD;
		} else {
			m->m_len = sc->big_bytes;
			m->m_flags &= ~M_PKTHDR;
			m_prev->m_next = m;
		}
		len -= m->m_len;
		m_prev = m;
	}

	/* trim trailing garbage from the last mbuf in the chain.  If
	 * there is any garbage, len will be negative */
	m->m_len += len;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m_head, csum);

	/* pass the frame up the stack */
	m_head->m_pkthdr.rcvif = ifp;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m_head);
	return;

drop:
	/* drop the frame -- the old mbuf(s) are re-cycled by running
	   every slot through the allocator */
        if (m_head) {
                len -= sc->big_bytes;
                m_freem(m_head);
        } else {
                len -= (sc->big_bytes + MXGEFW_PAD);
        }
        while ((int)len > 0) {
                idx = rx->cnt & rx->mask;
                rx->cnt++;
                m = rx->info[idx].m;
		/* only free the old mbuf if a replacement was
		   installed; otherwise the slot keeps recycling it */
                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
			m_freem(m);
			/* unmap the received buffer */
			old_map = rx->info[idx].map;
			bus_dmamap_sync(rx->dmat, old_map,
					BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rx->dmat, old_map);

			/* swap the bus_dmamap_t's */
			rx->info[idx].map = rx->extra_map;
			rx->extra_map = old_map;
		}
                len -= sc->big_bytes;
        }

	ifp->if_ierrors++;

}
1938
1939static inline void
1940mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
1941{
1942	struct ifnet *ifp;
1943	struct mbuf *m;
1944	mxge_rx_buf_t *rx;
1945	bus_dmamap_t old_map;
1946	int idx;
1947
1948	ifp = sc->ifp;
1949	rx = &sc->rx_small;
1950	idx = rx->cnt & rx->mask;
1951	rx->cnt++;
1952	/* save a pointer to the received mbuf */
1953	m = rx->info[idx].m;
1954	/* try to replace the received mbuf */
1955	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
1956		/* drop the frame -- the old mbuf is re-cycled */
1957		ifp->if_ierrors++;
1958		return;
1959	}
1960
1961	/* unmap the received buffer */
1962	old_map = rx->info[idx].map;
1963	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1964	bus_dmamap_unload(rx->dmat, old_map);
1965
1966	/* swap the bus_dmamap_t's */
1967	rx->info[idx].map = rx->extra_map;
1968	rx->extra_map = old_map;
1969
1970	/* mcp implicitly skips 1st 2 bytes so that packet is properly
1971	 * aligned */
1972	m->m_data += MXGEFW_PAD;
1973
1974	/* if the checksum is valid, mark it in the mbuf header */
1975	if (sc->csum_flag)
1976		mxge_rx_csum(m, csum);
1977
1978	/* pass the frame up the stack */
1979	m->m_pkthdr.rcvif = ifp;
1980	m->m_len = m->m_pkthdr.len = len;
1981	ifp->if_ipackets++;
1982	(*ifp->if_input)(ifp, m);
1983}
1984
1985static inline void
1986mxge_clean_rx_done(mxge_softc_t *sc)
1987{
1988	mxge_rx_done_t *rx_done = &sc->rx_done;
1989	int limit = 0;
1990	uint16_t length;
1991	uint16_t checksum;
1992
1993
1994	while (rx_done->entry[rx_done->idx].length != 0) {
1995		length = ntohs(rx_done->entry[rx_done->idx].length);
1996		rx_done->entry[rx_done->idx].length = 0;
1997		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
1998		if (length <= (MHLEN - MXGEFW_PAD))
1999			mxge_rx_done_small(sc, length, checksum);
2000		else
2001			mxge_rx_done_big(sc, length, checksum);
2002		rx_done->cnt++;
2003		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
2004
2005		/* limit potential for livelock */
2006		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
2007			break;
2008
2009	}
2010}
2011
2012
/*
 * Reclaim transmit ring slots up to the firmware's completion index
 * mcp_idx: free mbufs and DMA maps attached to first-segment slots,
 * then clear IFF_DRV_OACTIVE and restart the send queue if enough
 * ring space has been freed.
 */
static inline void
mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
{
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx, limit;

	limit = 0;
	tx = &sc->tx;
	ifp = sc->ifp;
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			ifp->if_opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		/* the flag marks the last descriptor of a packet;
		   pkt_done counts whole packets toward mcp_idx */
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
		/* limit potential for livelock by only handling
		   2 full tx rings per call */
		if (__predict_false(++limit >  2 * tx->mask))
			break;
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
           its OK to send packets */

	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		mtx_lock(&sc->tx_mtx);
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
		sc->tx.wake++;
		mxge_start_locked(sc);
		mtx_unlock(&sc->tx_mtx);
	}
}
2060
/*
 * Interrupt handler: process tx completions and rx frames while the
 * firmware's `valid' byte says work is pending.  For legacy (non-MSI)
 * interrupts the IRQ line is deasserted explicitly and, when
 * mxge_deassert_wait is set, the handler loops until the firmware
 * confirms the line is low.  Also reports link and RDMA status
 * changes when the firmware updates its stats block.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (!sc->msi_enabled) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	/* firmware refreshed the stats block: check for state changes */
	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
2128
/*
 * if_init handler.  Intentionally empty: interface bring-up is
 * performed by mxge_open(), called from the SIOCSIFFLAGS path in
 * mxge_ioctl().
 */
static void
mxge_init(void *arg)
{
}
2133
2134
2135
2136static void
2137mxge_free_mbufs(mxge_softc_t *sc)
2138{
2139	int i;
2140
2141	for (i = 0; i <= sc->rx_big.mask; i++) {
2142		if (sc->rx_big.info[i].m == NULL)
2143			continue;
2144		bus_dmamap_unload(sc->rx_big.dmat,
2145				  sc->rx_big.info[i].map);
2146		m_freem(sc->rx_big.info[i].m);
2147		sc->rx_big.info[i].m = NULL;
2148	}
2149
2150	for (i = 0; i <= sc->rx_big.mask; i++) {
2151		if (sc->rx_big.info[i].m == NULL)
2152			continue;
2153		bus_dmamap_unload(sc->rx_big.dmat,
2154				  sc->rx_big.info[i].map);
2155		m_freem(sc->rx_big.info[i].m);
2156		sc->rx_big.info[i].m = NULL;
2157	}
2158
2159	for (i = 0; i <= sc->tx.mask; i++) {
2160		sc->tx.info[i].flag = 0;
2161		if (sc->tx.info[i].m == NULL)
2162			continue;
2163		bus_dmamap_unload(sc->tx.dmat,
2164				  sc->tx.info[i].map);
2165		m_freem(sc->tx.info[i].m);
2166		sc->tx.info[i].m = NULL;
2167	}
2168}
2169
2170static void
2171mxge_free_rings(mxge_softc_t *sc)
2172{
2173	int i;
2174
2175	if (sc->tx.req_bytes != NULL)
2176		free(sc->tx.req_bytes, M_DEVBUF);
2177	if (sc->tx.seg_list != NULL)
2178		free(sc->tx.seg_list, M_DEVBUF);
2179	if (sc->rx_small.shadow != NULL)
2180		free(sc->rx_small.shadow, M_DEVBUF);
2181	if (sc->rx_big.shadow != NULL)
2182		free(sc->rx_big.shadow, M_DEVBUF);
2183	if (sc->tx.info != NULL) {
2184		if (sc->tx.dmat != NULL) {
2185			for (i = 0; i <= sc->tx.mask; i++) {
2186				bus_dmamap_destroy(sc->tx.dmat,
2187						   sc->tx.info[i].map);
2188			}
2189			bus_dma_tag_destroy(sc->tx.dmat);
2190		}
2191		free(sc->tx.info, M_DEVBUF);
2192	}
2193	if (sc->rx_small.info != NULL) {
2194		if (sc->rx_small.dmat != NULL) {
2195			for (i = 0; i <= sc->rx_small.mask; i++) {
2196				bus_dmamap_destroy(sc->rx_small.dmat,
2197						   sc->rx_small.info[i].map);
2198			}
2199			bus_dmamap_destroy(sc->rx_small.dmat,
2200					   sc->rx_small.extra_map);
2201			bus_dma_tag_destroy(sc->rx_small.dmat);
2202		}
2203		free(sc->rx_small.info, M_DEVBUF);
2204	}
2205	if (sc->rx_big.info != NULL) {
2206		if (sc->rx_big.dmat != NULL) {
2207			for (i = 0; i <= sc->rx_big.mask; i++) {
2208				bus_dmamap_destroy(sc->rx_big.dmat,
2209						   sc->rx_big.info[i].map);
2210			}
2211			bus_dmamap_destroy(sc->rx_big.dmat,
2212					   sc->rx_big.extra_map);
2213			bus_dma_tag_destroy(sc->rx_big.dmat);
2214		}
2215		free(sc->rx_big.info, M_DEVBUF);
2216	}
2217}
2218
/*
 * Query the firmware for its ring sizes, then allocate all host-side
 * ring state: the tx request staging block, busdma segment list,
 * rx shadow rings, per-ring info arrays, dma tags, and one dmamap
 * per ring slot (plus an extra map per rx ring used for buffer
 * swapping).  On failure everything allocated so far is released via
 * mxge_free_rings().
 *
 * Returns 0 on success or an errno value.
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size, rx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int i, err;
	unsigned long bytes;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine ring sizes\n");
		goto abort_with_nothing;
	}

	rx_ring_size = cmd.data0;

	/* firmware reports sizes in bytes; convert to entry counts */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_READY(&sc->ifp->if_snd);

	/* ring sizes are powers of two, so masks are size - 1 */
	sc->tx.mask = tx_ring_entries - 1;
	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;

	err = ENOMEM;

	/* allocate the tx request copy block */
	bytes = 8 +
		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.req_bytes == NULL)
		goto abort_with_nothing;
	/* ensure req_list entries are aligned to 8 bytes */
	sc->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
	sc->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.seg_list == NULL)
		goto abort_with_alloc;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.shadow == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.shadow == NULL)
		goto abort_with_alloc;

	/* allocate the host info rings */
	bytes = tx_ring_entries * sizeof (*sc->tx.info);
	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->tx.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.info == NULL)
		goto abort_with_alloc;

	/* allocate the busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx.boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC/2,	/* num segs */
				 sc->tx.boundary,	/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* small rx buffers come from mbuf headers (MHLEN) */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* big rx buffers are cluster-sized, one 4KB segment each */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 4096,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* now use these tags to setup dmamaps for each slot
	   in each ring */
	for (i = 0; i <= sc->tx.mask; i++) {
		err = bus_dmamap_create(sc->tx.dmat, 0,
					&sc->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	for (i = 0; i <= sc->rx_small.mask; i++) {
		err = bus_dmamap_create(sc->rx_small.dmat, 0,
					&sc->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			goto abort_with_alloc;
		}
	}
	/* spare map used when replacing a filled rx buffer */
	err = bus_dmamap_create(sc->rx_small.dmat, 0,
				&sc->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
			goto abort_with_alloc;
	}

	for (i = 0; i <= sc->rx_big.mask; i++) {
		err = bus_dmamap_create(sc->rx_big.dmat, 0,
					&sc->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_big.dmat, 0,
				&sc->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
	}
	return 0;

abort_with_alloc:
	/* mxge_free_rings() tolerates partially-built state */
	mxge_free_rings(sc);

abort_with_nothing:
	return err;
}
2402
/*
 * Bring the interface up: reset the NIC, learn the ring locations
 * in lanai SRAM, stock both receive rings with mbufs, program the
 * MTU/buffer sizes and stats DMA block into the firmware, and
 * finally start the ethernet side of the firmware.
 *
 * Returns 0 on success or an errno value; on failure any mbufs
 * already stocked are released.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err;
	bus_dmamap_t map;
	bus_addr_t bus;


	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}
	/* clear any stale entries in the interrupt queue */
	bzero(sc->rx_done.entry,
	      mxge_max_intr_slots * sizeof(*sc->rx_done.entry));

	/* pick the smallest cluster size that can hold a full frame */
	if (MCLBYTES >=
	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
		sc->big_bytes = MCLBYTES;
	else
		sc->big_bytes = MJUMPAGESIZE;


	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* use write-combining fifos when the BAR is mapped WC */
	if (sc->wc) {
		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
	} else {
		sc->tx.wc_fifo = 0;
		sc->rx_small.wc_fifo = 0;
		sc->rx_big.wc_fifo = 0;
	}


	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	for (i = 0; i <= sc->rx_big.mask; i++) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = sc->big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	if (err != 0) {
		/* old firmware: fall back to the obsolete interface,
		   pointing it at just the send_done_count field */
		bus = sc->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
2538
2539static int
2540mxge_close(mxge_softc_t *sc)
2541{
2542	mxge_cmd_t cmd;
2543	int err, old_down_cnt;
2544
2545	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2546	old_down_cnt = sc->down_cnt;
2547	mb();
2548	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2549	if (err) {
2550		device_printf(sc->dev, "Couldn't bring down link\n");
2551	}
2552	if (old_down_cnt == sc->down_cnt) {
2553		/* wait for down irq */
2554		DELAY(10 * sc->intr_coal_delay);
2555	}
2556	if (old_down_cnt == sc->down_cnt) {
2557		device_printf(sc->dev, "never got down irq\n");
2558	}
2559
2560	mxge_free_mbufs(sc);
2561
2562	return 0;
2563}
2564
2565static void
2566mxge_setup_cfg_space(mxge_softc_t *sc)
2567{
2568	device_t dev = sc->dev;
2569	int reg;
2570	uint16_t cmd, lnk, pectl;
2571
2572	/* find the PCIe link width and set max read request to 4KB*/
2573	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
2574		lnk = pci_read_config(dev, reg + 0x12, 2);
2575		sc->link_width = (lnk >> 4) & 0x3f;
2576
2577		pectl = pci_read_config(dev, reg + 0x8, 2);
2578		pectl = (pectl & ~0x7000) | (5 << 12);
2579		pci_write_config(dev, reg + 0x8, pectl, 2);
2580	}
2581
2582	/* Enable DMA and Memory space access */
2583	pci_enable_busmaster(dev);
2584	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2585	cmd |= PCIM_CMD_MEMEN;
2586	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2587}
2588
2589static uint32_t
2590mxge_read_reboot(mxge_softc_t *sc)
2591{
2592	device_t dev = sc->dev;
2593	uint32_t vs;
2594
2595	/* find the vendor specific offset */
2596	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
2597		device_printf(sc->dev,
2598			      "could not find vendor specific offset\n");
2599		return (uint32_t)-1;
2600	}
2601	/* enable read32 mode */
2602	pci_write_config(dev, vs + 0x10, 0x3, 1);
2603	/* tell NIC which register to read */
2604	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
2605	return (pci_read_config(dev, vs + 0x14, 4));
2606}
2607
/*
 * Attempt recovery after the transmit watchdog fires.  Detects
 * whether the NIC has rebooted (PCI config space wiped) or merely
 * wedged, then closes and re-opens the interface.  If recovery is
 * impossible the periodic callout is stopped to avoid console spam.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	int err;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			goto abort;
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		/* restore PCI configuration space */

		/* XXXX waiting for pci_cfg_restore() to be exported */
		goto abort; /* just abort for now */

		/* NOTE: unreachable until pci_cfg_restore() exists;
		   kept deliberately per the XXXX above */
		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);
	} else {
		/* NIC is alive but wedged; dump ring state for debugging */
		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
			      sc->tx.req, sc->tx.done);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      sc->tx.pkt_done,
			      be32toh(sc->fw_stats->send_done_count));
	}

	/* restart the interface; err reflects the re-open result */
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}

abort:
	/*
	 * stop the watchdog if the nic is dead, to avoid spamming the
	 * console
	 */
	if (err != 0) {
		callout_stop(&sc->co_hdl);
	}
}
2675
2676static void
2677mxge_watchdog(mxge_softc_t *sc)
2678{
2679	mxge_tx_buf_t *tx = &sc->tx;
2680
2681	/* see if we have outstanding transmits, which
2682	   have been pending for more than mxge_ticks */
2683	if (tx->req != tx->done &&
2684	    tx->watchdog_req != tx->watchdog_done &&
2685	    tx->done == tx->watchdog_done)
2686		mxge_watchdog_reset(sc);
2687
2688	tx->watchdog_req = tx->req;
2689	tx->watchdog_done = tx->done;
2690}
2691
/*
 * Periodic callout (every mxge_ticks).  Re-arms itself and runs the
 * transmit watchdog.  The callout was initialized with
 * callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0) in mxge_attach(),
 * so the callout code invokes this with driver_mtx held.
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;


	/* Synchronize with possible callout reset/stop. */
	if (callout_pending(&sc->co_hdl) ||
	    !callout_active(&sc->co_hdl)) {
		/* NOTE(review): this early-return path unlocks
		 * driver_mtx manually while the normal path below
		 * leaves it held -- confirm this matches the callout
		 * subsystem's unlock semantics for this callout's
		 * init flags. */
		mtx_unlock(&sc->driver_mtx);
		return;
	}

	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	mxge_watchdog(sc);
}
2708
2709static int
2710mxge_media_change(struct ifnet *ifp)
2711{
2712	return EINVAL;
2713}
2714
2715static int
2716mxge_change_mtu(mxge_softc_t *sc, int mtu)
2717{
2718	struct ifnet *ifp = sc->ifp;
2719	int real_mtu, old_mtu;
2720	int err = 0;
2721
2722
2723	real_mtu = mtu + ETHER_HDR_LEN;
2724	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
2725	    real_mtu < 60)
2726		return EINVAL;
2727	mtx_lock(&sc->driver_mtx);
2728	old_mtu = ifp->if_mtu;
2729	ifp->if_mtu = mtu;
2730	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2731		callout_stop(&sc->co_hdl);
2732		mxge_close(sc);
2733		err = mxge_open(sc);
2734		if (err != 0) {
2735			ifp->if_mtu = old_mtu;
2736			mxge_close(sc);
2737			(void) mxge_open(sc);
2738		}
2739		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2740	}
2741	mtx_unlock(&sc->driver_mtx);
2742	return err;
2743}
2744
2745static void
2746mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2747{
2748	mxge_softc_t *sc = ifp->if_softc;
2749
2750
2751	if (sc == NULL)
2752		return;
2753	ifmr->ifm_status = IFM_AVALID;
2754	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2755	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2756	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2757}
2758
2759static int
2760mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2761{
2762	mxge_softc_t *sc = ifp->if_softc;
2763	struct ifreq *ifr = (struct ifreq *)data;
2764	int err, mask;
2765
2766	err = 0;
2767	switch (command) {
2768	case SIOCSIFADDR:
2769	case SIOCGIFADDR:
2770		err = ether_ioctl(ifp, command, data);
2771		break;
2772
2773	case SIOCSIFMTU:
2774		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2775		break;
2776
2777	case SIOCSIFFLAGS:
2778		mtx_lock(&sc->driver_mtx);
2779		if (ifp->if_flags & IFF_UP) {
2780			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
2781				err = mxge_open(sc);
2782				callout_reset(&sc->co_hdl, mxge_ticks,
2783					      mxge_tick, sc);
2784			} else {
2785				/* take care of promis can allmulti
2786				   flag chages */
2787				mxge_change_promisc(sc,
2788						    ifp->if_flags & IFF_PROMISC);
2789				mxge_set_multicast_list(sc);
2790			}
2791		} else {
2792			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2793				mxge_close(sc);
2794				callout_stop(&sc->co_hdl);
2795			}
2796		}
2797		mtx_unlock(&sc->driver_mtx);
2798		break;
2799
2800	case SIOCADDMULTI:
2801	case SIOCDELMULTI:
2802		mtx_lock(&sc->driver_mtx);
2803		mxge_set_multicast_list(sc);
2804		mtx_unlock(&sc->driver_mtx);
2805		break;
2806
2807	case SIOCSIFCAP:
2808		mtx_lock(&sc->driver_mtx);
2809		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2810		if (mask & IFCAP_TXCSUM) {
2811			if (IFCAP_TXCSUM & ifp->if_capenable) {
2812				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
2813				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
2814						      | CSUM_TSO);
2815			} else {
2816				ifp->if_capenable |= IFCAP_TXCSUM;
2817				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2818			}
2819		} else if (mask & IFCAP_RXCSUM) {
2820			if (IFCAP_RXCSUM & ifp->if_capenable) {
2821				ifp->if_capenable &= ~IFCAP_RXCSUM;
2822				sc->csum_flag = 0;
2823			} else {
2824				ifp->if_capenable |= IFCAP_RXCSUM;
2825				sc->csum_flag = 1;
2826			}
2827		}
2828		if (mask & IFCAP_TSO4) {
2829			if (IFCAP_TSO4 & ifp->if_capenable) {
2830				ifp->if_capenable &= ~IFCAP_TSO4;
2831				ifp->if_hwassist &= ~CSUM_TSO;
2832			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
2833				ifp->if_capenable |= IFCAP_TSO4;
2834				ifp->if_hwassist |= CSUM_TSO;
2835			} else {
2836				printf("mxge requires tx checksum offload"
2837				       " be enabled to use TSO\n");
2838				err = EINVAL;
2839			}
2840		}
2841		mtx_unlock(&sc->driver_mtx);
2842		break;
2843
2844	case SIOCGIFMEDIA:
2845		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2846				    &sc->media, command);
2847                break;
2848
2849	default:
2850		err = ENOTTY;
2851        }
2852	return err;
2853}
2854
2855static void
2856mxge_fetch_tunables(mxge_softc_t *sc)
2857{
2858
2859	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2860			  &mxge_flow_control);
2861	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2862			  &mxge_intr_coal_delay);
2863	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2864			  &mxge_nvidia_ecrc_enable);
2865	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
2866			  &mxge_force_firmware);
2867	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2868			  &mxge_deassert_wait);
2869	TUNABLE_INT_FETCH("hw.mxge.verbose",
2870			  &mxge_verbose);
2871	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
2872
2873	if (bootverbose)
2874		mxge_verbose = 1;
2875	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2876		mxge_intr_coal_delay = 30;
2877	if (mxge_ticks == 0)
2878		mxge_ticks = hz;
2879	sc->pause = mxge_flow_control;
2880}
2881
2882static int
2883mxge_attach(device_t dev)
2884{
2885	mxge_softc_t *sc = device_get_softc(dev);
2886	struct ifnet *ifp;
2887	size_t bytes;
2888	int count, rid, err;
2889
2890	sc->dev = dev;
2891	mxge_fetch_tunables(sc);
2892
2893	err = bus_dma_tag_create(NULL,			/* parent */
2894				 1,			/* alignment */
2895				 4096,			/* boundary */
2896				 BUS_SPACE_MAXADDR,	/* low */
2897				 BUS_SPACE_MAXADDR,	/* high */
2898				 NULL, NULL,		/* filter */
2899				 65536 + 256,		/* maxsize */
2900				 MXGE_MAX_SEND_DESC, 	/* num segs */
2901				 4096,			/* maxsegsize */
2902				 0,			/* flags */
2903				 NULL, NULL,		/* lock */
2904				 &sc->parent_dmat);	/* tag */
2905
2906	if (err != 0) {
2907		device_printf(sc->dev, "Err %d allocating parent dmat\n",
2908			      err);
2909		goto abort_with_nothing;
2910	}
2911
2912	ifp = sc->ifp = if_alloc(IFT_ETHER);
2913	if (ifp == NULL) {
2914		device_printf(dev, "can not if_alloc()\n");
2915		err = ENOSPC;
2916		goto abort_with_parent_dmat;
2917	}
2918	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
2919		 device_get_nameunit(dev));
2920	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
2921	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
2922		 device_get_nameunit(dev));
2923	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
2924	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
2925		 "%s:drv", device_get_nameunit(dev));
2926	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
2927		 MTX_NETWORK_LOCK, MTX_DEF);
2928
2929	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
2930
2931	mxge_setup_cfg_space(sc);
2932
2933	/* Map the board into the kernel */
2934	rid = PCIR_BARS;
2935	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
2936					 ~0, 1, RF_ACTIVE);
2937	if (sc->mem_res == NULL) {
2938		device_printf(dev, "could not map memory\n");
2939		err = ENXIO;
2940		goto abort_with_lock;
2941	}
2942	sc->sram = rman_get_virtual(sc->mem_res);
2943	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
2944	if (sc->sram_size > rman_get_size(sc->mem_res)) {
2945		device_printf(dev, "impossible memory region size %ld\n",
2946			      rman_get_size(sc->mem_res));
2947		err = ENXIO;
2948		goto abort_with_mem_res;
2949	}
2950
2951	/* make NULL terminated copy of the EEPROM strings section of
2952	   lanai SRAM */
2953	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
2954	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
2955				rman_get_bushandle(sc->mem_res),
2956				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
2957				sc->eeprom_strings,
2958				MXGE_EEPROM_STRINGS_SIZE - 2);
2959	err = mxge_parse_strings(sc);
2960	if (err != 0)
2961		goto abort_with_mem_res;
2962
2963	/* Enable write combining for efficient use of PCIe bus */
2964	mxge_enable_wc(sc);
2965
2966	/* Allocate the out of band dma memory */
2967	err = mxge_dma_alloc(sc, &sc->cmd_dma,
2968			     sizeof (mxge_cmd_t), 64);
2969	if (err != 0)
2970		goto abort_with_mem_res;
2971	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
2972	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
2973	if (err != 0)
2974		goto abort_with_cmd_dma;
2975
2976	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
2977			     sizeof (*sc->fw_stats), 64);
2978	if (err != 0)
2979		goto abort_with_zeropad_dma;
2980	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
2981
2982	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
2983	if (err != 0)
2984		goto abort_with_fw_stats;
2985
2986	/* allocate interrupt queues */
2987	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
2988	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2989	if (err != 0)
2990		goto abort_with_dmabench;
2991	sc->rx_done.entry = sc->rx_done.dma.addr;
2992	bzero(sc->rx_done.entry, bytes);
2993
2994	/* Add our ithread  */
2995	count = pci_msi_count(dev);
2996	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
2997		rid = 1;
2998		sc->msi_enabled = 1;
2999	} else {
3000		rid = 0;
3001	}
3002	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3003					 1, RF_SHAREABLE | RF_ACTIVE);
3004	if (sc->irq_res == NULL) {
3005		device_printf(dev, "could not alloc interrupt\n");
3006		goto abort_with_rx_done;
3007	}
3008	if (mxge_verbose)
3009		device_printf(dev, "using %s irq %ld\n",
3010			      sc->msi_enabled ? "MSI" : "INTx",
3011			      rman_get_start(sc->irq_res));
3012	/* load the firmware */
3013	mxge_select_firmware(sc);
3014
3015	err = mxge_load_firmware(sc);
3016	if (err != 0)
3017		goto abort_with_irq_res;
3018	sc->intr_coal_delay = mxge_intr_coal_delay;
3019	err = mxge_reset(sc);
3020	if (err != 0)
3021		goto abort_with_irq_res;
3022
3023	err = mxge_alloc_rings(sc);
3024	if (err != 0) {
3025		device_printf(sc->dev, "failed to allocate rings\n");
3026		goto abort_with_irq_res;
3027	}
3028
3029	err = bus_setup_intr(sc->dev, sc->irq_res,
3030			     INTR_TYPE_NET | INTR_MPSAFE,
3031			     NULL, mxge_intr, sc, &sc->ih);
3032	if (err != 0) {
3033		goto abort_with_rings;
3034	}
3035	/* hook into the network stack */
3036	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3037	ifp->if_baudrate = 100000000;
3038	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3039		IFCAP_JUMBO_MTU;
3040	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3041	ifp->if_capenable = ifp->if_capabilities;
3042	sc->csum_flag = 1;
3043        ifp->if_init = mxge_init;
3044        ifp->if_softc = sc;
3045        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3046        ifp->if_ioctl = mxge_ioctl;
3047        ifp->if_start = mxge_start;
3048	ether_ifattach(ifp, sc->mac_addr);
3049	/* ether_ifattach sets mtu to 1500 */
3050	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
3051
3052	/* Initialise the ifmedia structure */
3053	ifmedia_init(&sc->media, 0, mxge_media_change,
3054		     mxge_media_status);
3055	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3056	mxge_add_sysctls(sc);
3057	return 0;
3058
3059abort_with_rings:
3060	mxge_free_rings(sc);
3061abort_with_irq_res:
3062	bus_release_resource(dev, SYS_RES_IRQ,
3063			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3064	if (sc->msi_enabled)
3065		pci_release_msi(dev);
3066abort_with_rx_done:
3067	sc->rx_done.entry = NULL;
3068	mxge_dma_free(&sc->rx_done.dma);
3069abort_with_dmabench:
3070	mxge_dma_free(&sc->dmabench_dma);
3071abort_with_fw_stats:
3072	mxge_dma_free(&sc->fw_stats_dma);
3073abort_with_zeropad_dma:
3074	mxge_dma_free(&sc->zeropad_dma);
3075abort_with_cmd_dma:
3076	mxge_dma_free(&sc->cmd_dma);
3077abort_with_mem_res:
3078	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3079abort_with_lock:
3080	pci_disable_busmaster(dev);
3081	mtx_destroy(&sc->cmd_mtx);
3082	mtx_destroy(&sc->tx_mtx);
3083	mtx_destroy(&sc->driver_mtx);
3084	if_free(ifp);
3085abort_with_parent_dmat:
3086	bus_dma_tag_destroy(sc->parent_dmat);
3087
3088abort_with_nothing:
3089	return err;
3090}
3091
/*
 * Device detach: stop the interface and watchdog, unhook from the
 * network stack, quiesce the firmware's RDMA engine, then release
 * every resource acquired in mxge_attach() in reverse order.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	/* bring the interface down under the driver lock */
	mtx_lock(&sc->driver_mtx);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	callout_stop(&sc->co_hdl);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	ifmedia_removeall(&sc->media);
	/* stop the firmware's dummy RDMA traffic */
	mxge_dummy_rdma(sc, 0);
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	mxge_free_rings(sc);
	/* MSI uses rid 1, legacy INTx uses rid 0 (see mxge_attach) */
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);

	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
3127
3128static int
3129mxge_shutdown(device_t dev)
3130{
3131	return 0;
3132}
3133
3134/*
3135  This file uses Myri10GE driver indentation.
3136
3137  Local Variables:
3138  c-file-style:"linux"
3139  tab-width:8
3140  End:
3141*/
3142