if_mxge.c revision 166875
1/******************************************************************************
2
3Copyright (c) 2006, Myricom Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Redistributions in binary form must reproduce the above copyright
13    notice, this list of conditions and the following disclaimer in the
14    documentation and/or other materials provided with the distribution.
15
16 3. Neither the name of the Myricom Inc, nor the names of its
17    contributors may be used to endorse or promote products derived from
18    this software without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30POSSIBILITY OF SUCH DAMAGE.
31
32***************************************************************************/
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 166875 2007-02-21 17:34:05Z gallatin $");
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/linker.h>
40#include <sys/firmware.h>
41#include <sys/endian.h>
42#include <sys/sockio.h>
43#include <sys/mbuf.h>
44#include <sys/malloc.h>
45#include <sys/kdb.h>
46#include <sys/kernel.h>
47#include <sys/module.h>
48#include <sys/memrange.h>
49#include <sys/socket.h>
50#include <sys/sysctl.h>
51#include <sys/sx.h>
52
53#include <net/if.h>
54#include <net/if_arp.h>
55#include <net/ethernet.h>
56#include <net/if_dl.h>
57#include <net/if_media.h>
58
59#include <net/bpf.h>
60
61#include <net/if_types.h>
62#include <net/if_vlan_var.h>
63#include <net/zlib.h>
64
65#include <netinet/in_systm.h>
66#include <netinet/in.h>
67#include <netinet/ip.h>
68#include <netinet/tcp.h>
69
70#include <machine/bus.h>
71#include <machine/resource.h>
72#include <sys/bus.h>
73#include <sys/rman.h>
74
75#include <dev/pci/pcireg.h>
76#include <dev/pci/pcivar.h>
77
78#include <vm/vm.h>		/* for pmap_mapdev() */
79#include <vm/pmap.h>
80
81#include <dev/mxge/mxge_mcp.h>
82#include <dev/mxge/mcp_gen_header.h>
83#include <dev/mxge/if_mxge_var.h>
84
85/* tunable params */
86static int mxge_nvidia_ecrc_enable = 1;
87static int mxge_force_firmware = 0;
88static int mxge_max_intr_slots = 1024;
89static int mxge_intr_coal_delay = 30;
90static int mxge_deassert_wait = 1;
91static int mxge_flow_control = 1;
92static int mxge_verbose = 0;
93static int mxge_ticks;
94static char *mxge_fw_unaligned = "mxge_ethp_z8e";
95static char *mxge_fw_aligned = "mxge_eth_z8e";
96
97static int mxge_probe(device_t dev);
98static int mxge_attach(device_t dev);
99static int mxge_detach(device_t dev);
100static int mxge_shutdown(device_t dev);
101static void mxge_intr(void *arg);
102
103static device_method_t mxge_methods[] =
104{
105  /* Device interface */
106  DEVMETHOD(device_probe, mxge_probe),
107  DEVMETHOD(device_attach, mxge_attach),
108  DEVMETHOD(device_detach, mxge_detach),
109  DEVMETHOD(device_shutdown, mxge_shutdown),
110  {0, 0}
111};
112
113static driver_t mxge_driver =
114{
115  "mxge",
116  mxge_methods,
117  sizeof(mxge_softc_t),
118};
119
120static devclass_t mxge_devclass;
121
122/* Declare ourselves to be a child of the PCI bus.*/
123DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
124MODULE_DEPEND(mxge, firmware, 1, 1, 1);
125
126static int
127mxge_probe(device_t dev)
128{
129  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
130      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
131	  device_set_desc(dev, "Myri10G-PCIE-8A");
132	  return 0;
133  }
134  return ENXIO;
135}
136
137static void
138mxge_enable_wc(mxge_softc_t *sc)
139{
140	struct mem_range_desc mrdesc;
141	vm_paddr_t pa;
142	vm_offset_t len;
143	int err, action;
144
145	pa = rman_get_start(sc->mem_res);
146	len = rman_get_size(sc->mem_res);
147	mrdesc.mr_base = pa;
148	mrdesc.mr_len = len;
149	mrdesc.mr_flags = MDF_WRITECOMBINE;
150	action = MEMRANGE_SET_UPDATE;
151	strcpy((char *)&mrdesc.mr_owner, "mxge");
152	err = mem_range_attr_set(&mrdesc, &action);
153	if (err != 0) {
154		device_printf(sc->dev,
155			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
156			      (unsigned long)pa, (unsigned long)len, err);
157	} else {
158		sc->wc = 1;
159	}
160}
161
162
163/* callback to get our DMA address */
164static void
165mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
166			 int error)
167{
168	if (error == 0) {
169		*(bus_addr_t *) arg = segs->ds_addr;
170	}
171}
172
173static int
174mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
175		   bus_size_t alignment)
176{
177	int err;
178	device_t dev = sc->dev;
179
180	/* allocate DMAable memory tags */
181	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
182				 alignment,		/* alignment */
183				 4096,			/* boundary */
184				 BUS_SPACE_MAXADDR,	/* low */
185				 BUS_SPACE_MAXADDR,	/* high */
186				 NULL, NULL,		/* filter */
187				 bytes,			/* maxsize */
188				 1,			/* num segs */
189				 4096,			/* maxsegsize */
190				 BUS_DMA_COHERENT,	/* flags */
191				 NULL, NULL,		/* lock */
192				 &dma->dmat);		/* tag */
193	if (err != 0) {
194		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
195		return err;
196	}
197
198	/* allocate DMAable memory & map */
199	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
200			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
201				| BUS_DMA_ZERO),  &dma->map);
202	if (err != 0) {
203		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
204		goto abort_with_dmat;
205	}
206
207	/* load the memory */
208	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
209			      mxge_dmamap_callback,
210			      (void *)&dma->bus_addr, 0);
211	if (err != 0) {
212		device_printf(dev, "couldn't load map (err = %d)\n", err);
213		goto abort_with_mem;
214	}
215	return 0;
216
217abort_with_mem:
218	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
219abort_with_dmat:
220	(void)bus_dma_tag_destroy(dma->dmat);
221	return err;
222}
223
224
225static void
226mxge_dma_free(mxge_dma_t *dma)
227{
228	bus_dmamap_unload(dma->dmat, dma->map);
229	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
230	(void)bus_dma_tag_destroy(dma->dmat);
231}
232
233/*
234 * The eeprom strings on the lanaiX have the format
235 * SN=x\0
236 * MAC=x:x:x:x:x:x\0
237 * PC=text\0
238 */
239
240static int
241mxge_parse_strings(mxge_softc_t *sc)
242{
243#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
244
245	char *ptr, *limit;
246	int i, found_mac;
247
248	ptr = sc->eeprom_strings;
249	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
250	found_mac = 0;
251	while (ptr < limit && *ptr != '\0') {
252		if (memcmp(ptr, "MAC=", 4) == 0) {
253			ptr += 1;
254			sc->mac_addr_string = ptr;
255			for (i = 0; i < 6; i++) {
256				ptr += 3;
257				if ((ptr + 2) > limit)
258					goto abort;
259				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
260				found_mac = 1;
261			}
262		} else if (memcmp(ptr, "PC=", 3) == 0) {
263			ptr += 3;
264			strncpy(sc->product_code_string, ptr,
265				sizeof (sc->product_code_string) - 1);
266		} else if (memcmp(ptr, "SN=", 3) == 0) {
267			ptr += 3;
268			strncpy(sc->serial_number_string, ptr,
269				sizeof (sc->serial_number_string) - 1);
270		}
271		MXGE_NEXT_STRING(ptr);
272	}
273
274	if (found_mac)
275		return 0;
276
277 abort:
278	device_printf(sc->dev, "failed to parse eeprom_strings\n");
279
280	return ENXIO;
281}
282
283#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
284static int
285mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
286{
287	uint32_t val;
288	unsigned long off;
289	char *va, *cfgptr;
290	uint16_t vendor_id, device_id;
291	uintptr_t bus, slot, func, ivend, idev;
292	uint32_t *ptr32;
293
294	/* XXXX
295	   Test below is commented because it is believed that doing
296	   config read/write beyond 0xff will access the config space
297	   for the next larger function.  Uncomment this and remove
298	   the hacky pmap_mapdev() way of accessing config space when
299	   FreeBSD grows support for extended pcie config space access
300	*/
301#if 0
302	/* See if we can, by some miracle, access the extended
303	   config space */
304	val = pci_read_config(pdev, 0x178, 4);
305	if (val != 0xffffffff) {
306		val |= 0x40;
307		pci_write_config(pdev, 0x178, val, 4);
308		return 0;
309	}
310#endif
311	/* Rather than using normal pci config space writes, we must
312	 * map the Nvidia config space ourselves.  This is because on
313	 * opteron/nvidia class machine the 0xe000000 mapping is
314	 * handled by the nvidia chipset, that means the internal PCI
315	 * device (the on-chip northbridge), or the amd-8131 bridge
316	 * and things behind them are not visible by this method.
317	 */
318
319	BUS_READ_IVAR(device_get_parent(pdev), pdev,
320		      PCI_IVAR_BUS, &bus);
321	BUS_READ_IVAR(device_get_parent(pdev), pdev,
322		      PCI_IVAR_SLOT, &slot);
323	BUS_READ_IVAR(device_get_parent(pdev), pdev,
324		      PCI_IVAR_FUNCTION, &func);
325	BUS_READ_IVAR(device_get_parent(pdev), pdev,
326		      PCI_IVAR_VENDOR, &ivend);
327	BUS_READ_IVAR(device_get_parent(pdev), pdev,
328		      PCI_IVAR_DEVICE, &idev);
329
330	off =  0xe0000000UL
331		+ 0x00100000UL * (unsigned long)bus
332		+ 0x00001000UL * (unsigned long)(func
333						 + 8 * slot);
334
335	/* map it into the kernel */
336	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
337
338
339	if (va == NULL) {
340		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
341		return EIO;
342	}
343	/* get a pointer to the config space mapped into the kernel */
344	cfgptr = va + (off & PAGE_MASK);
345
346	/* make sure that we can really access it */
347	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
348	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
349	if (! (vendor_id == ivend && device_id == idev)) {
350		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
351			      vendor_id, device_id);
352		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
353		return EIO;
354	}
355
356	ptr32 = (uint32_t*)(cfgptr + 0x178);
357	val = *ptr32;
358
359	if (val == 0xffffffff) {
360		device_printf(sc->dev, "extended mapping failed\n");
361		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
362		return EIO;
363	}
364	*ptr32 = val | 0x40;
365	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
366	if (mxge_verbose)
367		device_printf(sc->dev,
368			      "Enabled ECRC on upstream Nvidia bridge "
369			      "at %d:%d:%d\n",
370			      (int)bus, (int)slot, (int)func);
371	return 0;
372}
373#else
374static int
375mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
376{
377	device_printf(sc->dev,
378		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
379	return ENXIO;
380}
381#endif
382/*
383 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
384 * when the PCI-E Completion packets are aligned on an 8-byte
385 * boundary.  Some PCI-E chip sets always align Completion packets; on
386 * the ones that do not, the alignment can be enforced by enabling
387 * ECRC generation (if supported).
388 *
389 * When PCI-E Completion packets are not aligned, it is actually more
390 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
391 *
392 * If the driver can neither enable ECRC nor verify that it has
393 * already been enabled, then it must use a firmware image which works
394 * around unaligned completion packets (ethp_z8e.dat), and it should
395 * also ensure that it never gives the device a Read-DMA which is
396 * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
397 * enabled, then the driver should use the aligned (eth_z8e.dat)
398 * firmware image, and set tx.boundary to 4KB.
399 */
400
401static void
402mxge_select_firmware(mxge_softc_t *sc)
403{
404	int err, aligned = 0;
405	device_t pdev;
406	uint16_t pvend, pdid;
407
408
409	if (mxge_force_firmware != 0) {
410		if (mxge_force_firmware == 1)
411			aligned = 1;
412		else
413			aligned = 0;
414		if (mxge_verbose)
415			device_printf(sc->dev,
416				      "Assuming %s completions (forced)\n",
417				      aligned ? "aligned" : "unaligned");
418		goto abort;
419	}
420
421	/* if the PCIe link width is 4 or less, we can use the aligned
422	   firmware and skip any checks */
423	if (sc->link_width != 0 && sc->link_width <= 4) {
424		device_printf(sc->dev,
425			      "PCIe x%d Link, expect reduced performance\n",
426			      sc->link_width);
427		aligned = 1;
428		goto abort;
429	}
430
431	pdev = device_get_parent(device_get_parent(sc->dev));
432	if (pdev == NULL) {
433		device_printf(sc->dev, "could not find parent?\n");
434		goto abort;
435	}
436	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
437	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);
438
439	/* see if we can enable ECRC's on an upstream
440	   Nvidia bridge */
441	if (mxge_nvidia_ecrc_enable &&
442	    (pvend == 0x10de && pdid == 0x005d)) {
443		err = mxge_enable_nvidia_ecrc(sc, pdev);
444		if (err == 0) {
445			aligned = 1;
446			if (mxge_verbose)
447				device_printf(sc->dev,
448					      "Assuming aligned completions"
449					      " (ECRC)\n");
450		}
451	}
452	/* see if the upstream bridge is known to
453	   provided aligned completions */
454	if (/* HT2000 */ (pvend == 0x1166 && pdid == 0x0132) ||
455	    /* PLX */    (pvend == 0x10b5 && pdid == 0x8532) ||
456	    /* Intel */  (pvend == 0x8086 &&
457	      /* E5000 NorthBridge*/((pdid >= 0x25f7 && pdid <= 0x25fa) ||
458	      /* E5000 SouthBridge*/ (pdid >= 0x3510 && pdid <= 0x351b)))) {
459		aligned = 1;
460		if (mxge_verbose)
461			device_printf(sc->dev,
462				      "Assuming aligned completions "
463				      "(0x%x:0x%x)\n", pvend, pdid);
464	}
465
466abort:
467	if (aligned) {
468		sc->fw_name = mxge_fw_aligned;
469		sc->tx.boundary = 4096;
470	} else {
471		sc->fw_name = mxge_fw_unaligned;
472		sc->tx.boundary = 2048;
473	}
474}
475
476union qualhack
477{
478        const char *ro_char;
479        char *rw_char;
480};
481
482static int
483mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
484{
485
486
487	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
488		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
489			      be32toh(hdr->mcp_type));
490		return EIO;
491	}
492
493	/* save firmware version for sysctl */
494	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
495	if (mxge_verbose)
496		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
497
498	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
499	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
500
501	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
502	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
503		device_printf(sc->dev, "Found firmware version %s\n",
504			      sc->fw_version);
505		device_printf(sc->dev, "Driver needs %d.%d\n",
506			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
507		return EINVAL;
508	}
509	return 0;
510
511}
512
513static int
514mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
515{
516	const struct firmware *fw;
517	const mcp_gen_header_t *hdr;
518	unsigned hdr_offset;
519	const char *fw_data;
520	union qualhack hack;
521	int status;
522	unsigned int i;
523	char dummy;
524
525
526	fw = firmware_get(sc->fw_name);
527
528	if (fw == NULL) {
529		device_printf(sc->dev, "Could not find firmware image %s\n",
530			      sc->fw_name);
531		return ENOENT;
532	}
533	if (fw->datasize > *limit ||
534	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
535		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
536			      sc->fw_name, (int)fw->datasize, (int) *limit);
537		status = ENOSPC;
538		goto abort_with_fw;
539	}
540	*limit = fw->datasize;
541
542	/* check id */
543	fw_data = (const char *)fw->data;
544	hdr_offset = htobe32(*(const uint32_t *)
545			     (fw_data + MCP_HEADER_PTR_OFFSET));
546	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
547		device_printf(sc->dev, "Bad firmware file");
548		status = EIO;
549		goto abort_with_fw;
550	}
551	hdr = (const void*)(fw_data + hdr_offset);
552
553	status = mxge_validate_firmware(sc, hdr);
554	if (status != 0)
555		goto abort_with_fw;
556
557	hack.ro_char = fw_data;
558	/* Copy the inflated firmware to NIC SRAM. */
559	for (i = 0; i < *limit; i += 256) {
560		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
561			      hack.rw_char + i,
562			      min(256U, (unsigned)(*limit - i)));
563		mb();
564		dummy = *sc->sram;
565		mb();
566	}
567
568	status = 0;
569abort_with_fw:
570	firmware_put(fw, FIRMWARE_UNLOAD);
571	return status;
572}
573
574/*
575 * Enable or disable periodic RDMAs from the host to make certain
576 * chipsets resend dropped PCIe messages
577 */
578
579static void
580mxge_dummy_rdma(mxge_softc_t *sc, int enable)
581{
582	char buf_bytes[72];
583	volatile uint32_t *confirm;
584	volatile char *submit;
585	uint32_t *buf, dma_low, dma_high;
586	int i;
587
588	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
589
590	/* clear confirmation addr */
591	confirm = (volatile uint32_t *)sc->cmd;
592	*confirm = 0;
593	mb();
594
595	/* send an rdma command to the PCIe engine, and wait for the
596	   response in the confirmation address.  The firmware should
597	   write a -1 there to indicate it is alive and well
598	*/
599
600	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
601	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
602	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
603	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
604	buf[2] = htobe32(0xffffffff);		/* confirm data */
605	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
606	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
607	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
608	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
609	buf[5] = htobe32(enable);			/* enable? */
610
611
612	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
613
614	mxge_pio_copy(submit, buf, 64);
615	mb();
616	DELAY(1000);
617	mb();
618	i = 0;
619	while (*confirm != 0xffffffff && i < 20) {
620		DELAY(1000);
621		i++;
622	}
623	if (*confirm != 0xffffffff) {
624		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
625			      (enable ? "enable" : "disable"), confirm,
626			      *confirm);
627	}
628	return;
629}
630
631static int
632mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
633{
634	mcp_cmd_t *buf;
635	char buf_bytes[sizeof(*buf) + 8];
636	volatile mcp_cmd_response_t *response = sc->cmd;
637	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
638	uint32_t dma_low, dma_high;
639	int sleep_total = 0;
640
641	/* ensure buf is aligned to 8 bytes */
642	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
643
644	buf->data0 = htobe32(data->data0);
645	buf->data1 = htobe32(data->data1);
646	buf->data2 = htobe32(data->data2);
647	buf->cmd = htobe32(cmd);
648	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
649	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
650
651	buf->response_addr.low = htobe32(dma_low);
652	buf->response_addr.high = htobe32(dma_high);
653	mtx_lock(&sc->cmd_mtx);
654	response->result = 0xffffffff;
655	mb();
656	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
657
658	/* wait up to 20ms */
659	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
660		bus_dmamap_sync(sc->cmd_dma.dmat,
661				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
662		mb();
663		if (response->result != 0xffffffff) {
664			if (response->result == 0) {
665				data->data0 = be32toh(response->data);
666				mtx_unlock(&sc->cmd_mtx);
667				return 0;
668			} else {
669				device_printf(sc->dev,
670					      "mxge: command %d "
671					      "failed, result = %d\n",
672					      cmd, be32toh(response->result));
673				mtx_unlock(&sc->cmd_mtx);
674				return ENXIO;
675			}
676		}
677		DELAY(1000);
678	}
679	mtx_unlock(&sc->cmd_mtx);
680	device_printf(sc->dev, "mxge: command %d timed out"
681		      "result = %d\n",
682		      cmd, be32toh(response->result));
683	return EAGAIN;
684}
685
686static int
687mxge_adopt_running_firmware(mxge_softc_t *sc)
688{
689	struct mcp_gen_header *hdr;
690	const size_t bytes = sizeof (struct mcp_gen_header);
691	size_t hdr_offset;
692	int status;
693
694	/* find running firmware header */
695	hdr_offset = htobe32(*(volatile uint32_t *)
696			     (sc->sram + MCP_HEADER_PTR_OFFSET));
697
698	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
699		device_printf(sc->dev,
700			      "Running firmware has bad header offset (%d)\n",
701			      (int)hdr_offset);
702		return EIO;
703	}
704
705	/* copy header of running firmware from SRAM to host memory to
706	 * validate firmware */
707	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
708	if (hdr == NULL) {
709		device_printf(sc->dev, "could not malloc firmware hdr\n");
710		return ENOMEM;
711	}
712	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
713				rman_get_bushandle(sc->mem_res),
714				hdr_offset, (char *)hdr, bytes);
715	status = mxge_validate_firmware(sc, hdr);
716	free(hdr, M_DEVBUF);
717
718	/*
719	 * check to see if adopted firmware has bug where adopting
720	 * it will cause broadcasts to be filtered unless the NIC
721	 * is kept in ALLMULTI mode
722	 */
723	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
724	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
725		sc->adopted_rx_filter_bug = 1;
726		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
727			      "working around rx filter bug\n",
728			      sc->fw_ver_major, sc->fw_ver_minor,
729			      sc->fw_ver_tiny);
730	}
731
732	return status;
733}
734
735
736static int
737mxge_load_firmware(mxge_softc_t *sc)
738{
739	volatile uint32_t *confirm;
740	volatile char *submit;
741	char buf_bytes[72];
742	uint32_t *buf, size, dma_low, dma_high;
743	int status, i;
744
745	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
746
747	size = sc->sram_size;
748	status = mxge_load_firmware_helper(sc, &size);
749	if (status) {
750		/* Try to use the currently running firmware, if
751		   it is new enough */
752		status = mxge_adopt_running_firmware(sc);
753		if (status) {
754			device_printf(sc->dev,
755				      "failed to adopt running firmware\n");
756			return status;
757		}
758		device_printf(sc->dev,
759			      "Successfully adopted running firmware\n");
760		if (sc->tx.boundary == 4096) {
761			device_printf(sc->dev,
762				"Using firmware currently running on NIC"
763				 ".  For optimal\n");
764			device_printf(sc->dev,
765				 "performance consider loading optimized "
766				 "firmware\n");
767		}
768		sc->fw_name = mxge_fw_unaligned;
769		sc->tx.boundary = 2048;
770		return 0;
771	}
772	/* clear confirmation addr */
773	confirm = (volatile uint32_t *)sc->cmd;
774	*confirm = 0;
775	mb();
776	/* send a reload command to the bootstrap MCP, and wait for the
777	   response in the confirmation address.  The firmware should
778	   write a -1 there to indicate it is alive and well
779	*/
780
781	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
782	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
783
784	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
785	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
786	buf[2] = htobe32(0xffffffff);	/* confirm data */
787
788	/* FIX: All newest firmware should un-protect the bottom of
789	   the sram before handoff. However, the very first interfaces
790	   do not. Therefore the handoff copy must skip the first 8 bytes
791	*/
792					/* where the code starts*/
793	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
794	buf[4] = htobe32(size - 8); 	/* length of code */
795	buf[5] = htobe32(8);		/* where to copy to */
796	buf[6] = htobe32(0);		/* where to jump to */
797
798	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
799	mxge_pio_copy(submit, buf, 64);
800	mb();
801	DELAY(1000);
802	mb();
803	i = 0;
804	while (*confirm != 0xffffffff && i < 20) {
805		DELAY(1000*10);
806		i++;
807		bus_dmamap_sync(sc->cmd_dma.dmat,
808				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
809	}
810	if (*confirm != 0xffffffff) {
811		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
812			confirm, *confirm);
813
814		return ENXIO;
815	}
816	return 0;
817}
818
819static int
820mxge_update_mac_address(mxge_softc_t *sc)
821{
822	mxge_cmd_t cmd;
823	uint8_t *addr = sc->mac_addr;
824	int status;
825
826
827	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
828		     | (addr[2] << 8) | addr[3]);
829
830	cmd.data1 = ((addr[4] << 8) | (addr[5]));
831
832	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
833	return status;
834}
835
836static int
837mxge_change_pause(mxge_softc_t *sc, int pause)
838{
839	mxge_cmd_t cmd;
840	int status;
841
842	if (pause)
843		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
844				       &cmd);
845	else
846		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
847				       &cmd);
848
849	if (status) {
850		device_printf(sc->dev, "Failed to set flow control mode\n");
851		return ENXIO;
852	}
853	sc->pause = pause;
854	return 0;
855}
856
857static void
858mxge_change_promisc(mxge_softc_t *sc, int promisc)
859{
860	mxge_cmd_t cmd;
861	int status;
862
863	if (promisc)
864		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
865				       &cmd);
866	else
867		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
868				       &cmd);
869
870	if (status) {
871		device_printf(sc->dev, "Failed to set promisc mode\n");
872	}
873}
874
875static void
876mxge_set_multicast_list(mxge_softc_t *sc)
877{
878	mxge_cmd_t cmd;
879	struct ifmultiaddr *ifma;
880	struct ifnet *ifp = sc->ifp;
881	int err;
882
883	/* This firmware is known to not support multicast */
884	if (!sc->fw_multicast_support)
885		return;
886
887	/* Disable multicast filtering while we play with the lists*/
888	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
889	if (err != 0) {
890		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
891		       " error status: %d\n", err);
892		return;
893	}
894
895	if (sc->adopted_rx_filter_bug)
896		return;
897
898	if (ifp->if_flags & IFF_ALLMULTI)
899		/* request to disable multicast filtering, so quit here */
900		return;
901
902	/* Flush all the filters */
903
904	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
905	if (err != 0) {
906		device_printf(sc->dev,
907			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
908			      ", error status: %d\n", err);
909		return;
910	}
911
912	/* Walk the multicast list, and add each address */
913
914	IF_ADDR_LOCK(ifp);
915	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
916		if (ifma->ifma_addr->sa_family != AF_LINK)
917			continue;
918		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
919		      &cmd.data0, 4);
920		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
921		      &cmd.data1, 2);
922		cmd.data0 = htonl(cmd.data0);
923		cmd.data1 = htonl(cmd.data1);
924		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
925		if (err != 0) {
926			device_printf(sc->dev, "Failed "
927			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
928			       "%d\t", err);
929			/* abort, leaving multicast filtering off */
930			IF_ADDR_UNLOCK(ifp);
931			return;
932		}
933	}
934	IF_ADDR_UNLOCK(ifp);
935	/* Enable multicast filtering */
936	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
937	if (err != 0) {
938		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
939		       ", error status: %d\n", err);
940	}
941}
942
943
944static int
945mxge_reset(mxge_softc_t *sc)
946{
947
948	mxge_cmd_t cmd;
949	size_t bytes;
950	int status;
951
952	/* try to send a reset command to the card to see if it
953	   is alive */
954	memset(&cmd, 0, sizeof (cmd));
955	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
956	if (status != 0) {
957		device_printf(sc->dev, "failed reset\n");
958		return ENXIO;
959	}
960
961	mxge_dummy_rdma(sc, 1);
962
963	/* Now exchange information about interrupts  */
964	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);\
965	memset(sc->rx_done.entry, 0, bytes);
966	cmd.data0 = (uint32_t)bytes;
967	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
968	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
969	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
970	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
971
972	status |= mxge_send_cmd(sc,
973				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
974
975
976	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
977
978	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
979	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
980
981
982	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
983				&cmd);
984	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
985	if (status != 0) {
986		device_printf(sc->dev, "failed set interrupt parameters\n");
987		return status;
988	}
989
990
991	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
992
993
994	/* run a DMA benchmark */
995	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
996
997	/* Read DMA */
998	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
999	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1000	cmd.data2 = sc->tx.boundary * 0x10000;
1001
1002	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1003	if (status != 0)
1004		device_printf(sc->dev, "read dma benchmark failed\n");
1005	else
1006		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1007			(cmd.data0 & 0xffff);
1008
1009	/* Write DMA */
1010	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1011	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1012	cmd.data2 = sc->tx.boundary * 0x1;
1013	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1014	if (status != 0)
1015		device_printf(sc->dev, "write dma benchmark failed\n");
1016	else
1017		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1018			(cmd.data0 & 0xffff);
1019	/* Read/Write DMA */
1020	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1021	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1022	cmd.data2 = sc->tx.boundary * 0x10001;
1023	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1024	if (status != 0)
1025		device_printf(sc->dev, "read/write dma benchmark failed\n");
1026	else
1027		sc->read_write_dma =
1028			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
1029			(cmd.data0 & 0xffff);
1030
1031	/* reset mcp/driver shared state back to 0 */
1032	bzero(sc->rx_done.entry, bytes);
1033	sc->rx_done.idx = 0;
1034	sc->rx_done.cnt = 0;
1035	sc->tx.req = 0;
1036	sc->tx.done = 0;
1037	sc->tx.pkt_done = 0;
1038	sc->tx.wake = 0;
1039	sc->tx.stall = 0;
1040	sc->rx_big.cnt = 0;
1041	sc->rx_small.cnt = 0;
1042	sc->rdma_tags_available = 15;
1043	sc->fw_stats->valid = 0;
1044	sc->fw_stats->send_done_count = 0;
1045	status = mxge_update_mac_address(sc);
1046	mxge_change_promisc(sc, 0);
1047	mxge_change_pause(sc, sc->pause);
1048	mxge_set_multicast_list(sc);
1049	return status;
1050}
1051
1052static int
1053mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1054{
1055        mxge_softc_t *sc;
1056        unsigned int intr_coal_delay;
1057        int err;
1058
1059        sc = arg1;
1060        intr_coal_delay = sc->intr_coal_delay;
1061        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1062        if (err != 0) {
1063                return err;
1064        }
1065        if (intr_coal_delay == sc->intr_coal_delay)
1066                return 0;
1067
1068        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1069                return EINVAL;
1070
1071	mtx_lock(&sc->driver_mtx);
1072	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1073	sc->intr_coal_delay = intr_coal_delay;
1074
1075	mtx_unlock(&sc->driver_mtx);
1076        return err;
1077}
1078
1079static int
1080mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1081{
1082        mxge_softc_t *sc;
1083        unsigned int enabled;
1084        int err;
1085
1086        sc = arg1;
1087        enabled = sc->pause;
1088        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1089        if (err != 0) {
1090                return err;
1091        }
1092        if (enabled == sc->pause)
1093                return 0;
1094
1095	mtx_lock(&sc->driver_mtx);
1096	err = mxge_change_pause(sc, enabled);
1097	mtx_unlock(&sc->driver_mtx);
1098        return err;
1099}
1100
1101static int
1102mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1103{
1104        int err;
1105
1106        if (arg1 == NULL)
1107                return EFAULT;
1108        arg2 = be32toh(*(int *)arg1);
1109        arg1 = NULL;
1110        err = sysctl_handle_int(oidp, arg1, arg2, req);
1111
1112        return err;
1113}
1114
1115static void
1116mxge_add_sysctls(mxge_softc_t *sc)
1117{
1118	struct sysctl_ctx_list *ctx;
1119	struct sysctl_oid_list *children;
1120	mcp_irq_data_t *fw;
1121
1122	ctx = device_get_sysctl_ctx(sc->dev);
1123	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1124	fw = sc->fw_stats;
1125
1126	/* random information */
1127	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1128		       "firmware_version",
1129		       CTLFLAG_RD, &sc->fw_version,
1130		       0, "firmware version");
1131	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1132		       "serial_number",
1133		       CTLFLAG_RD, &sc->serial_number_string,
1134		       0, "serial number");
1135	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1136		       "product_code",
1137		       CTLFLAG_RD, &sc->product_code_string,
1138		       0, "product_code");
1139	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1140		       "pcie_link_width",
1141		       CTLFLAG_RD, &sc->link_width,
1142		       0, "tx_boundary");
1143	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1144		       "tx_boundary",
1145		       CTLFLAG_RD, &sc->tx.boundary,
1146		       0, "tx_boundary");
1147	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1148		       "write_combine",
1149		       CTLFLAG_RD, &sc->wc,
1150		       0, "write combining PIO?");
1151	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1152		       "read_dma_MBs",
1153		       CTLFLAG_RD, &sc->read_dma,
1154		       0, "DMA Read speed in MB/s");
1155	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1156		       "write_dma_MBs",
1157		       CTLFLAG_RD, &sc->write_dma,
1158		       0, "DMA Write speed in MB/s");
1159	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1160		       "read_write_dma_MBs",
1161		       CTLFLAG_RD, &sc->read_write_dma,
1162		       0, "DMA concurrent Read/Write speed in MB/s");
1163
1164
1165	/* performance related tunables */
1166	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1167			"intr_coal_delay",
1168			CTLTYPE_INT|CTLFLAG_RW, sc,
1169			0, mxge_change_intr_coal,
1170			"I", "interrupt coalescing delay in usecs");
1171
1172	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1173			"flow_control_enabled",
1174			CTLTYPE_INT|CTLFLAG_RW, sc,
1175			0, mxge_change_flow_control,
1176			"I", "interrupt coalescing delay in usecs");
1177
1178	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1179		       "deassert_wait",
1180		       CTLFLAG_RW, &mxge_deassert_wait,
1181		       0, "Wait for IRQ line to go low in ihandler");
1182
1183	/* stats block from firmware is in network byte order.
1184	   Need to swap it */
1185	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1186			"link_up",
1187			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1188			0, mxge_handle_be32,
1189			"I", "link up");
1190	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1191			"rdma_tags_available",
1192			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1193			0, mxge_handle_be32,
1194			"I", "rdma_tags_available");
1195	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1196			"dropped_link_overflow",
1197			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1198			0, mxge_handle_be32,
1199			"I", "dropped_link_overflow");
1200	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1201			"dropped_link_error_or_filtered",
1202			CTLTYPE_INT|CTLFLAG_RD,
1203			&fw->dropped_link_error_or_filtered,
1204			0, mxge_handle_be32,
1205			"I", "dropped_link_error_or_filtered");
1206	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1207			"dropped_multicast_filtered",
1208			CTLTYPE_INT|CTLFLAG_RD,
1209			&fw->dropped_multicast_filtered,
1210			0, mxge_handle_be32,
1211			"I", "dropped_multicast_filtered");
1212	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1213			"dropped_runt",
1214			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1215			0, mxge_handle_be32,
1216			"I", "dropped_runt");
1217	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1218			"dropped_overrun",
1219			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1220			0, mxge_handle_be32,
1221			"I", "dropped_overrun");
1222	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1223			"dropped_no_small_buffer",
1224			CTLTYPE_INT|CTLFLAG_RD,
1225			&fw->dropped_no_small_buffer,
1226			0, mxge_handle_be32,
1227			"I", "dropped_no_small_buffer");
1228	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1229			"dropped_no_big_buffer",
1230			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1231			0, mxge_handle_be32,
1232			"I", "dropped_no_big_buffer");
1233
1234	/* host counters exported for debugging */
1235	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1236		       "rx_small_cnt",
1237		       CTLFLAG_RD, &sc->rx_small.cnt,
1238		       0, "rx_small_cnt");
1239	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1240		       "rx_big_cnt",
1241		       CTLFLAG_RD, &sc->rx_big.cnt,
1242		       0, "rx_small_cnt");
1243	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1244		       "tx_req",
1245		       CTLFLAG_RD, &sc->tx.req,
1246		       0, "tx_req");
1247	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1248		       "tx_done",
1249		       CTLFLAG_RD, &sc->tx.done,
1250		       0, "tx_done");
1251	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1252		       "tx_pkt_done",
1253		       CTLFLAG_RD, &sc->tx.pkt_done,
1254		       0, "tx_done");
1255	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1256		       "tx_stall",
1257		       CTLFLAG_RD, &sc->tx.stall,
1258		       0, "tx_stall");
1259	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1260		       "tx_wake",
1261		       CTLFLAG_RD, &sc->tx.wake,
1262		       0, "tx_wake");
1263
1264	/* verbose printing? */
1265	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1266		       "verbose",
1267		       CTLFLAG_RW, &mxge_verbose,
1268		       0, "verbose printing");
1269
1270}
1271
1272/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1273   backwards one at a time and handle ring wraps */
1274
1275static inline void
1276mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1277			    mcp_kreq_ether_send_t *src, int cnt)
1278{
1279        int idx, starting_slot;
1280        starting_slot = tx->req;
1281        while (cnt > 1) {
1282                cnt--;
1283                idx = (starting_slot + cnt) & tx->mask;
1284                mxge_pio_copy(&tx->lanai[idx],
1285			      &src[cnt], sizeof(*src));
1286                mb();
1287        }
1288}
1289
1290/*
1291 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1292 * at most 32 bytes at a time, so as to avoid involving the software
1293 * pio handler in the nic.   We re-write the first segment's flags
1294 * to mark them valid only after writing the entire chain
1295 */
1296
1297static inline void
1298mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
1299                  int cnt)
1300{
1301        int idx, i;
1302        uint32_t *src_ints;
1303	volatile uint32_t *dst_ints;
1304        mcp_kreq_ether_send_t *srcp;
1305	volatile mcp_kreq_ether_send_t *dstp, *dst;
1306	uint8_t last_flags;
1307
1308        idx = tx->req & tx->mask;
1309
1310	last_flags = src->flags;
1311	src->flags = 0;
1312        mb();
1313        dst = dstp = &tx->lanai[idx];
1314        srcp = src;
1315
1316        if ((idx + cnt) < tx->mask) {
1317                for (i = 0; i < (cnt - 1); i += 2) {
1318                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1319                        mb(); /* force write every 32 bytes */
1320                        srcp += 2;
1321                        dstp += 2;
1322                }
1323        } else {
1324                /* submit all but the first request, and ensure
1325                   that it is submitted below */
1326                mxge_submit_req_backwards(tx, src, cnt);
1327                i = 0;
1328        }
1329        if (i < cnt) {
1330                /* submit the first request */
1331                mxge_pio_copy(dstp, srcp, sizeof(*src));
1332                mb(); /* barrier before setting valid flag */
1333        }
1334
1335        /* re-write the last 32-bits with the valid flags */
1336        src->flags = last_flags;
1337        src_ints = (uint32_t *)src;
1338        src_ints+=3;
1339        dst_ints = (volatile uint32_t *)dst;
1340        dst_ints+=3;
1341        *dst_ints =  *src_ints;
1342        tx->req += cnt;
1343        mb();
1344}
1345
1346static inline void
1347mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1348{
1349    tx->req += cnt;
1350    mb();
1351    while (cnt >= 4) {
1352	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
1353	    mb();
1354	    src += 4;
1355	    cnt -= 4;
1356    }
1357    if (cnt > 0) {
1358	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
1359	       needs to be so that we don't overrun it */
1360	    mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
1361	    mb();
1362    }
1363}
1364
1365static void
1366mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
1367{
1368	mxge_tx_buf_t *tx;
1369	mcp_kreq_ether_send_t *req;
1370	bus_dma_segment_t *seg;
1371	struct ether_header *eh;
1372	struct ip *ip;
1373	struct tcphdr *tcp;
1374	uint32_t low, high_swapped;
1375	int len, seglen, cum_len, cum_len_next;
1376	int next_is_first, chop, cnt, rdma_count, small;
1377	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1378	uint8_t flags, flags_next;
1379	static int once;
1380
1381	mss = m->m_pkthdr.tso_segsz;
1382
1383	/* negative cum_len signifies to the
1384	 * send loop that we are still in the
1385	 * header portion of the TSO packet.
1386	 */
1387
1388	/* ensure we have the ethernet, IP and TCP
1389	   header together in the first mbuf, copy
1390	   it to a scratch buffer if not */
1391	if (__predict_false(m->m_len < sizeof (*eh)
1392			    + sizeof (*ip))) {
1393		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
1394			   sc->scratch);
1395		eh = (struct ether_header *)sc->scratch;
1396	} else {
1397		eh = mtod(m, struct ether_header *);
1398	}
1399	ip = (struct ip *) (eh + 1);
1400	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
1401			    + sizeof (*tcp))) {
1402		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
1403			   + sizeof (*tcp),  sc->scratch);
1404		eh = (struct ether_header *) sc->scratch;
1405		ip = (struct ip *) (eh + 1);
1406	}
1407
1408	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1409	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));
1410
1411	/* TSO implies checksum offload on this hardware */
1412	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
1413	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1414
1415
1416	/* for TSO, pseudo_hdr_offset holds mss.
1417	 * The firmware figures out where to put
1418	 * the checksum by parsing the header. */
1419	pseudo_hdr_offset = htobe16(mss);
1420
1421	tx = &sc->tx;
1422	req = tx->req_list;
1423	seg = tx->seg_list;
1424	cnt = 0;
1425	rdma_count = 0;
1426	/* "rdma_count" is the number of RDMAs belonging to the
1427	 * current packet BEFORE the current send request. For
1428	 * non-TSO packets, this is equal to "count".
1429	 * For TSO packets, rdma_count needs to be reset
1430	 * to 0 after a segment cut.
1431	 *
1432	 * The rdma_count field of the send request is
1433	 * the number of RDMAs of the packet starting at
1434	 * that request. For TSO send requests with one ore more cuts
1435	 * in the middle, this is the number of RDMAs starting
1436	 * after the last cut in the request. All previous
1437	 * segments before the last cut implicitly have 1 RDMA.
1438	 *
1439	 * Since the number of RDMAs is not known beforehand,
1440	 * it must be filled-in retroactively - after each
1441	 * segmentation cut or at the end of the entire packet.
1442	 */
1443
1444	while (busdma_seg_cnt) {
1445		/* Break the busdma segment up into pieces*/
1446		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1447		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1448		len = seglen = seg->ds_len;
1449
1450		while (len) {
1451			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1452			cum_len_next = cum_len + seglen;
1453			(req-rdma_count)->rdma_count = rdma_count + 1;
1454			if (__predict_true(cum_len >= 0)) {
1455				/* payload */
1456				chop = (cum_len_next > mss);
1457				cum_len_next = cum_len_next % mss;
1458				next_is_first = (cum_len_next == 0);
1459				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1460				flags_next |= next_is_first *
1461					MXGEFW_FLAGS_FIRST;
1462				rdma_count |= -(chop | next_is_first);
1463				rdma_count += chop & !next_is_first;
1464			} else if (cum_len_next >= 0) {
1465				/* header ends */
1466				rdma_count = -1;
1467				cum_len_next = 0;
1468				seglen = -cum_len;
1469				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1470				flags_next = MXGEFW_FLAGS_TSO_PLD |
1471					MXGEFW_FLAGS_FIRST |
1472					(small * MXGEFW_FLAGS_SMALL);
1473			    }
1474
1475			req->addr_high = high_swapped;
1476			req->addr_low = htobe32(low);
1477			req->pseudo_hdr_offset = pseudo_hdr_offset;
1478			req->pad = 0;
1479			req->rdma_count = 1;
1480			req->length = htobe16(seglen);
1481			req->cksum_offset = cksum_offset;
1482			req->flags = flags | ((cum_len & 1) *
1483					      MXGEFW_FLAGS_ALIGN_ODD);
1484			low += seglen;
1485			len -= seglen;
1486			cum_len = cum_len_next;
1487			flags = flags_next;
1488			req++;
1489			cnt++;
1490			rdma_count++;
1491			if (__predict_false(cksum_offset > seglen))
1492				cksum_offset -= seglen;
1493			else
1494				cksum_offset = 0;
1495			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
1496				goto drop;
1497		}
1498		busdma_seg_cnt--;
1499		seg++;
1500	}
1501	(req-rdma_count)->rdma_count = rdma_count;
1502
1503	do {
1504		req--;
1505		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1506	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1507
1508	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1509	if (tx->wc_fifo == NULL)
1510		mxge_submit_req(tx, tx->req_list, cnt);
1511	else
1512		mxge_submit_req_wc(tx, tx->req_list, cnt);
1513	return;
1514
1515drop:
1516	m_freem(m);
1517	sc->ifp->if_oerrors++;
1518	if (!once) {
1519		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
1520		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
1521		once = 1;
1522	}
1523	return;
1524
1525}
1526
1527static void
1528mxge_encap(mxge_softc_t *sc, struct mbuf *m)
1529{
1530	mcp_kreq_ether_send_t *req;
1531	bus_dma_segment_t *seg;
1532	struct mbuf *m_tmp;
1533	struct ifnet *ifp;
1534	mxge_tx_buf_t *tx;
1535	struct ether_header *eh;
1536	struct ip *ip;
1537	int cnt, cum_len, err, i, idx, odd_flag;
1538	uint16_t pseudo_hdr_offset;
1539        uint8_t flags, cksum_offset;
1540
1541
1542
1543	ifp = sc->ifp;
1544	tx = &sc->tx;
1545
1546	/* (try to) map the frame for DMA */
1547	idx = tx->req & tx->mask;
1548	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1549				      m, tx->seg_list, &cnt,
1550				      BUS_DMA_NOWAIT);
1551	if (err == EFBIG) {
1552		/* Too many segments in the chain.  Try
1553		   to defrag */
1554		m_tmp = m_defrag(m, M_NOWAIT);
1555		if (m_tmp == NULL) {
1556			goto drop;
1557		}
1558		m = m_tmp;
1559		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1560					      tx->info[idx].map,
1561					      m, tx->seg_list, &cnt,
1562					      BUS_DMA_NOWAIT);
1563	}
1564	if (err != 0) {
1565		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1566			      " packet len = %d\n", err, m->m_pkthdr.len);
1567		goto drop;
1568	}
1569	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1570			BUS_DMASYNC_PREWRITE);
1571	tx->info[idx].m = m;
1572
1573
1574	/* TSO is different enough, we handle it in another routine */
1575	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1576		mxge_encap_tso(sc, m, cnt);
1577		return;
1578	}
1579
1580	req = tx->req_list;
1581	cksum_offset = 0;
1582	pseudo_hdr_offset = 0;
1583	flags = MXGEFW_FLAGS_NO_TSO;
1584
1585	/* checksum offloading? */
1586	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1587		/* ensure ip header is in first mbuf, copy
1588		   it to a scratch buffer if not */
1589		if (__predict_false(m->m_len < sizeof (*eh)
1590				    + sizeof (*ip))) {
1591			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
1592				   sc->scratch);
1593			eh = (struct ether_header *)sc->scratch;
1594		} else {
1595			eh = mtod(m, struct ether_header *);
1596		}
1597		ip = (struct ip *) (eh + 1);
1598		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
1599		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1600		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1601		req->cksum_offset = cksum_offset;
1602		flags |= MXGEFW_FLAGS_CKSUM;
1603		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1604	} else {
1605		odd_flag = 0;
1606	}
1607	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1608		flags |= MXGEFW_FLAGS_SMALL;
1609
1610	/* convert segments into a request list */
1611	cum_len = 0;
1612	seg = tx->seg_list;
1613	req->flags = MXGEFW_FLAGS_FIRST;
1614	for (i = 0; i < cnt; i++) {
1615		req->addr_low =
1616			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1617		req->addr_high =
1618			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1619		req->length = htobe16(seg->ds_len);
1620		req->cksum_offset = cksum_offset;
1621		if (cksum_offset > seg->ds_len)
1622			cksum_offset -= seg->ds_len;
1623		else
1624			cksum_offset = 0;
1625		req->pseudo_hdr_offset = pseudo_hdr_offset;
1626		req->pad = 0; /* complete solid 16-byte block */
1627		req->rdma_count = 1;
1628		req->flags |= flags | ((cum_len & 1) * odd_flag);
1629		cum_len += seg->ds_len;
1630		seg++;
1631		req++;
1632		req->flags = 0;
1633	}
1634	req--;
1635	/* pad runts to 60 bytes */
1636	if (cum_len < 60) {
1637		req++;
1638		req->addr_low =
1639			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
1640		req->addr_high =
1641			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
1642		req->length = htobe16(60 - cum_len);
1643		req->cksum_offset = 0;
1644		req->pseudo_hdr_offset = pseudo_hdr_offset;
1645		req->pad = 0; /* complete solid 16-byte block */
1646		req->rdma_count = 1;
1647		req->flags |= flags | ((cum_len & 1) * odd_flag);
1648		cnt++;
1649	}
1650
1651	tx->req_list[0].rdma_count = cnt;
1652#if 0
1653	/* print what the firmware will see */
1654	for (i = 0; i < cnt; i++) {
1655		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1656		    "cso:%d, flags:0x%x, rdma:%d\n",
1657		    i, (int)ntohl(tx->req_list[i].addr_high),
1658		    (int)ntohl(tx->req_list[i].addr_low),
1659		    (int)ntohs(tx->req_list[i].length),
1660		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1661		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1662		    tx->req_list[i].rdma_count);
1663	}
1664	printf("--------------\n");
1665#endif
1666	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1667	if (tx->wc_fifo == NULL)
1668		mxge_submit_req(tx, tx->req_list, cnt);
1669	else
1670		mxge_submit_req_wc(tx, tx->req_list, cnt);
1671	return;
1672
1673drop:
1674	m_freem(m);
1675	ifp->if_oerrors++;
1676	return;
1677}
1678
1679
1680
1681
1682static inline void
1683mxge_start_locked(mxge_softc_t *sc)
1684{
1685	struct mbuf *m;
1686	struct ifnet *ifp;
1687
1688	ifp = sc->ifp;
1689	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1690	       > MXGE_MAX_SEND_DESC) {
1691
1692		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1693		if (m == NULL) {
1694			return;
1695		}
1696		/* let BPF see it */
1697		BPF_MTAP(ifp, m);
1698
1699		/* give it to the nic */
1700		mxge_encap(sc, m);
1701	}
1702	/* ran out of transmit slots */
1703	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1704		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1705		sc->tx.stall++;
1706	}
1707}
1708
1709static void
1710mxge_start(struct ifnet *ifp)
1711{
1712	mxge_softc_t *sc = ifp->if_softc;
1713
1714
1715	mtx_lock(&sc->tx_mtx);
1716	mxge_start_locked(sc);
1717	mtx_unlock(&sc->tx_mtx);
1718}
1719
1720/*
1721 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1722 * at most 32 bytes at a time, so as to avoid involving the software
1723 * pio handler in the nic.   We re-write the first segment's low
1724 * DMA address to mark it valid only after we write the entire chunk
1725 * in a burst
1726 */
1727static inline void
1728mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1729		mcp_kreq_ether_recv_t *src)
1730{
1731	uint32_t low;
1732
1733	low = src->addr_low;
1734	src->addr_low = 0xffffffff;
1735	mxge_pio_copy(dst, src, 4 * sizeof (*src));
1736	mb();
1737	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
1738	mb();
1739	dst->addr_low = low;
1740	mb();
1741}
1742
1743static int
1744mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1745{
1746	bus_dma_segment_t seg;
1747	struct mbuf *m;
1748	mxge_rx_buf_t *rx = &sc->rx_small;
1749	int cnt, err;
1750
1751	m = m_gethdr(M_DONTWAIT, MT_DATA);
1752	if (m == NULL) {
1753		rx->alloc_fail++;
1754		err = ENOBUFS;
1755		goto done;
1756	}
1757	m->m_len = MHLEN;
1758	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1759				      &seg, &cnt, BUS_DMA_NOWAIT);
1760	if (err != 0) {
1761		m_free(m);
1762		goto done;
1763	}
1764	rx->info[idx].m = m;
1765	rx->shadow[idx].addr_low =
1766		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1767	rx->shadow[idx].addr_high =
1768		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1769
1770done:
1771	if ((idx & 7) == 7) {
1772		if (rx->wc_fifo == NULL)
1773			mxge_submit_8rx(&rx->lanai[idx - 7],
1774					&rx->shadow[idx - 7]);
1775		else {
1776			mb();
1777			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1778		}
1779        }
1780	return err;
1781}
1782
1783static int
1784mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1785{
1786	bus_dma_segment_t seg;
1787	struct mbuf *m;
1788	mxge_rx_buf_t *rx = &sc->rx_big;
1789	int cnt, err;
1790
1791	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
1792	if (m == NULL) {
1793		rx->alloc_fail++;
1794		err = ENOBUFS;
1795		goto done;
1796	}
1797	m->m_len = sc->big_bytes;
1798	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1799				      &seg, &cnt, BUS_DMA_NOWAIT);
1800	if (err != 0) {
1801		m_free(m);
1802		goto done;
1803	}
1804	rx->info[idx].m = m;
1805	rx->shadow[idx].addr_low =
1806		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1807	rx->shadow[idx].addr_high =
1808		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1809
1810done:
1811	if ((idx & 7) == 7) {
1812		if (rx->wc_fifo == NULL)
1813			mxge_submit_8rx(&rx->lanai[idx - 7],
1814					&rx->shadow[idx - 7]);
1815		else {
1816			mb();
1817			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1818		}
1819        }
1820	return err;
1821}
1822
1823static inline void
1824mxge_rx_csum(struct mbuf *m, int csum)
1825{
1826	struct ether_header *eh;
1827	struct ip *ip;
1828
1829	eh = mtod(m, struct ether_header *);
1830	if (__predict_true(eh->ether_type ==  htons(ETHERTYPE_IP))) {
1831		ip = (struct ip *)(eh + 1);
1832		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
1833				   ip->ip_p == IPPROTO_UDP)) {
1834			m->m_pkthdr.csum_data = csum;
1835			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1836		}
1837	}
1838}
1839
1840static inline void
1841mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
1842{
1843	struct ifnet *ifp;
1844	struct mbuf *m = 0; 		/* -Wunitialized */
1845	struct mbuf *m_prev = 0;	/* -Wunitialized */
1846	struct mbuf *m_head = 0;
1847	bus_dmamap_t old_map;
1848	mxge_rx_buf_t *rx;
1849	int idx;
1850
1851
1852	rx = &sc->rx_big;
1853	ifp = sc->ifp;
1854	while (len > 0) {
1855		idx = rx->cnt & rx->mask;
1856                rx->cnt++;
1857		/* save a pointer to the received mbuf */
1858		m = rx->info[idx].m;
1859		/* try to replace the received mbuf */
1860		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
1861			goto drop;
1862		}
1863		/* unmap the received buffer */
1864		old_map = rx->info[idx].map;
1865		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1866		bus_dmamap_unload(rx->dmat, old_map);
1867
1868		/* swap the bus_dmamap_t's */
1869		rx->info[idx].map = rx->extra_map;
1870		rx->extra_map = old_map;
1871
1872		/* chain multiple segments together */
1873		if (!m_head) {
1874			m_head = m;
1875			/* mcp implicitly skips 1st bytes so that
1876			 * packet is properly aligned */
1877			m->m_data += MXGEFW_PAD;
1878			m->m_pkthdr.len = len;
1879			m->m_len = sc->big_bytes - MXGEFW_PAD;
1880		} else {
1881			m->m_len = sc->big_bytes;
1882			m->m_flags &= ~M_PKTHDR;
1883			m_prev->m_next = m;
1884		}
1885		len -= m->m_len;
1886		m_prev = m;
1887	}
1888
1889	/* trim trailing garbage from the last mbuf in the chain.  If
1890	 * there is any garbage, len will be negative */
1891	m->m_len += len;
1892
1893	/* if the checksum is valid, mark it in the mbuf header */
1894	if (sc->csum_flag)
1895		mxge_rx_csum(m_head, csum);
1896
1897	/* pass the frame up the stack */
1898	m_head->m_pkthdr.rcvif = ifp;
1899	ifp->if_ipackets++;
1900	(*ifp->if_input)(ifp, m_head);
1901	return;
1902
1903drop:
1904	/* drop the frame -- the old mbuf(s) are re-cycled by running
1905	   every slot through the allocator */
1906        if (m_head) {
1907                len -= sc->big_bytes;
1908                m_freem(m_head);
1909        } else {
1910                len -= (sc->big_bytes + MXGEFW_PAD);
1911        }
1912        while ((int)len > 0) {
1913                idx = rx->cnt & rx->mask;
1914                rx->cnt++;
1915                m = rx->info[idx].m;
1916                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
1917			m_freem(m);
1918			/* unmap the received buffer */
1919			old_map = rx->info[idx].map;
1920			bus_dmamap_sync(rx->dmat, old_map,
1921					BUS_DMASYNC_POSTREAD);
1922			bus_dmamap_unload(rx->dmat, old_map);
1923
1924			/* swap the bus_dmamap_t's */
1925			rx->info[idx].map = rx->extra_map;
1926			rx->extra_map = old_map;
1927		}
1928                len -= sc->big_bytes;
1929        }
1930
1931	ifp->if_ierrors++;
1932
1933}
1934
1935static inline void
1936mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
1937{
1938	struct ifnet *ifp;
1939	struct mbuf *m;
1940	mxge_rx_buf_t *rx;
1941	bus_dmamap_t old_map;
1942	int idx;
1943
1944	ifp = sc->ifp;
1945	rx = &sc->rx_small;
1946	idx = rx->cnt & rx->mask;
1947	rx->cnt++;
1948	/* save a pointer to the received mbuf */
1949	m = rx->info[idx].m;
1950	/* try to replace the received mbuf */
1951	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
1952		/* drop the frame -- the old mbuf is re-cycled */
1953		ifp->if_ierrors++;
1954		return;
1955	}
1956
1957	/* unmap the received buffer */
1958	old_map = rx->info[idx].map;
1959	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1960	bus_dmamap_unload(rx->dmat, old_map);
1961
1962	/* swap the bus_dmamap_t's */
1963	rx->info[idx].map = rx->extra_map;
1964	rx->extra_map = old_map;
1965
1966	/* mcp implicitly skips 1st 2 bytes so that packet is properly
1967	 * aligned */
1968	m->m_data += MXGEFW_PAD;
1969
1970	/* if the checksum is valid, mark it in the mbuf header */
1971	if (sc->csum_flag)
1972		mxge_rx_csum(m, csum);
1973
1974	/* pass the frame up the stack */
1975	m->m_pkthdr.rcvif = ifp;
1976	m->m_len = m->m_pkthdr.len = len;
1977	ifp->if_ipackets++;
1978	(*ifp->if_input)(ifp, m);
1979}
1980
1981static inline void
1982mxge_clean_rx_done(mxge_softc_t *sc)
1983{
1984	mxge_rx_done_t *rx_done = &sc->rx_done;
1985	int limit = 0;
1986	uint16_t length;
1987	uint16_t checksum;
1988
1989
1990	while (rx_done->entry[rx_done->idx].length != 0) {
1991		length = ntohs(rx_done->entry[rx_done->idx].length);
1992		rx_done->entry[rx_done->idx].length = 0;
1993		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
1994		if (length <= (MHLEN - MXGEFW_PAD))
1995			mxge_rx_done_small(sc, length, checksum);
1996		else
1997			mxge_rx_done_big(sc, length, checksum);
1998		rx_done->cnt++;
1999		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
2000
2001		/* limit potential for livelock */
2002		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
2003			break;
2004
2005	}
2006}
2007
2008
2009static inline void
2010mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2011{
2012	struct ifnet *ifp;
2013	mxge_tx_buf_t *tx;
2014	struct mbuf *m;
2015	bus_dmamap_t map;
2016	int idx, limit;
2017
2018	limit = 0;
2019	tx = &sc->tx;
2020	ifp = sc->ifp;
2021	while (tx->pkt_done != mcp_idx) {
2022		idx = tx->done & tx->mask;
2023		tx->done++;
2024		m = tx->info[idx].m;
2025		/* mbuf and DMA map only attached to the first
2026		   segment per-mbuf */
2027		if (m != NULL) {
2028			ifp->if_opackets++;
2029			tx->info[idx].m = NULL;
2030			map = tx->info[idx].map;
2031			bus_dmamap_unload(tx->dmat, map);
2032			m_freem(m);
2033		}
2034		if (tx->info[idx].flag) {
2035			tx->info[idx].flag = 0;
2036			tx->pkt_done++;
2037		}
2038		/* limit potential for livelock by only handling
2039		   2 full tx rings per call */
2040		if (__predict_false(++limit >  2 * tx->mask))
2041			break;
2042	}
2043
2044	/* If we have space, clear IFF_OACTIVE to tell the stack that
2045           its OK to send packets */
2046
2047	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2048	    tx->req - tx->done < (tx->mask + 1)/4) {
2049		mtx_lock(&sc->tx_mtx);
2050		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2051		sc->tx.wake++;
2052		mxge_start_locked(sc);
2053		mtx_unlock(&sc->tx_mtx);
2054	}
2055}
2056
2057static void
2058mxge_intr(void *arg)
2059{
2060	mxge_softc_t *sc = arg;
2061	mcp_irq_data_t *stats = sc->fw_stats;
2062	mxge_tx_buf_t *tx = &sc->tx;
2063	mxge_rx_done_t *rx_done = &sc->rx_done;
2064	uint32_t send_done_count;
2065	uint8_t valid;
2066
2067
2068	/* make sure the DMA has finished */
2069	if (!stats->valid) {
2070		return;
2071	}
2072	valid = stats->valid;
2073
2074	if (!sc->msi_enabled) {
2075		/* lower legacy IRQ  */
2076		*sc->irq_deassert = 0;
2077		if (!mxge_deassert_wait)
2078			/* don't wait for conf. that irq is low */
2079			stats->valid = 0;
2080	} else {
2081		stats->valid = 0;
2082	}
2083
2084	/* loop while waiting for legacy irq deassertion */
2085	do {
2086		/* check for transmit completes and receives */
2087		send_done_count = be32toh(stats->send_done_count);
2088		while ((send_done_count != tx->pkt_done) ||
2089		       (rx_done->entry[rx_done->idx].length != 0)) {
2090			mxge_tx_done(sc, (int)send_done_count);
2091			mxge_clean_rx_done(sc);
2092			send_done_count = be32toh(stats->send_done_count);
2093		}
2094	} while (*((volatile uint8_t *) &stats->valid));
2095
2096	if (__predict_false(stats->stats_updated)) {
2097		if (sc->link_state != stats->link_up) {
2098			sc->link_state = stats->link_up;
2099			if (sc->link_state) {
2100				if_link_state_change(sc->ifp, LINK_STATE_UP);
2101				if (mxge_verbose)
2102					device_printf(sc->dev, "link up\n");
2103			} else {
2104				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2105				if (mxge_verbose)
2106					device_printf(sc->dev, "link down\n");
2107			}
2108		}
2109		if (sc->rdma_tags_available !=
2110		    be32toh(sc->fw_stats->rdma_tags_available)) {
2111			sc->rdma_tags_available =
2112				be32toh(sc->fw_stats->rdma_tags_available);
2113			device_printf(sc->dev, "RDMA timed out! %d tags "
2114				      "left\n", sc->rdma_tags_available);
2115		}
2116		sc->down_cnt += stats->link_down;
2117	}
2118
2119	/* check to see if we have rx token to pass back */
2120	if (valid & 0x1)
2121	    *sc->irq_claim = be32toh(3);
2122	*(sc->irq_claim + 1) = be32toh(3);
2123}
2124
2125static void
2126mxge_init(void *arg)
2127{
2128}
2129
2130
2131
2132static void
2133mxge_free_mbufs(mxge_softc_t *sc)
2134{
2135	int i;
2136
2137	for (i = 0; i <= sc->rx_big.mask; i++) {
2138		if (sc->rx_big.info[i].m == NULL)
2139			continue;
2140		bus_dmamap_unload(sc->rx_big.dmat,
2141				  sc->rx_big.info[i].map);
2142		m_freem(sc->rx_big.info[i].m);
2143		sc->rx_big.info[i].m = NULL;
2144	}
2145
2146	for (i = 0; i <= sc->rx_big.mask; i++) {
2147		if (sc->rx_big.info[i].m == NULL)
2148			continue;
2149		bus_dmamap_unload(sc->rx_big.dmat,
2150				  sc->rx_big.info[i].map);
2151		m_freem(sc->rx_big.info[i].m);
2152		sc->rx_big.info[i].m = NULL;
2153	}
2154
2155	for (i = 0; i <= sc->tx.mask; i++) {
2156		sc->tx.info[i].flag = 0;
2157		if (sc->tx.info[i].m == NULL)
2158			continue;
2159		bus_dmamap_unload(sc->tx.dmat,
2160				  sc->tx.info[i].map);
2161		m_freem(sc->tx.info[i].m);
2162		sc->tx.info[i].m = NULL;
2163	}
2164}
2165
2166static void
2167mxge_free_rings(mxge_softc_t *sc)
2168{
2169	int i;
2170
2171	if (sc->tx.req_bytes != NULL)
2172		free(sc->tx.req_bytes, M_DEVBUF);
2173	if (sc->tx.seg_list != NULL)
2174		free(sc->tx.seg_list, M_DEVBUF);
2175	if (sc->rx_small.shadow != NULL)
2176		free(sc->rx_small.shadow, M_DEVBUF);
2177	if (sc->rx_big.shadow != NULL)
2178		free(sc->rx_big.shadow, M_DEVBUF);
2179	if (sc->tx.info != NULL) {
2180		if (sc->tx.dmat != NULL) {
2181			for (i = 0; i <= sc->tx.mask; i++) {
2182				bus_dmamap_destroy(sc->tx.dmat,
2183						   sc->tx.info[i].map);
2184			}
2185			bus_dma_tag_destroy(sc->tx.dmat);
2186		}
2187		free(sc->tx.info, M_DEVBUF);
2188	}
2189	if (sc->rx_small.info != NULL) {
2190		if (sc->rx_small.dmat != NULL) {
2191			for (i = 0; i <= sc->rx_small.mask; i++) {
2192				bus_dmamap_destroy(sc->rx_small.dmat,
2193						   sc->rx_small.info[i].map);
2194			}
2195			bus_dmamap_destroy(sc->rx_small.dmat,
2196					   sc->rx_small.extra_map);
2197			bus_dma_tag_destroy(sc->rx_small.dmat);
2198		}
2199		free(sc->rx_small.info, M_DEVBUF);
2200	}
2201	if (sc->rx_big.info != NULL) {
2202		if (sc->rx_big.dmat != NULL) {
2203			for (i = 0; i <= sc->rx_big.mask; i++) {
2204				bus_dmamap_destroy(sc->rx_big.dmat,
2205						   sc->rx_big.info[i].map);
2206			}
2207			bus_dmamap_destroy(sc->rx_big.dmat,
2208					   sc->rx_big.extra_map);
2209			bus_dma_tag_destroy(sc->rx_big.dmat);
2210		}
2211		free(sc->rx_big.info, M_DEVBUF);
2212	}
2213}
2214
2215static int
2216mxge_alloc_rings(mxge_softc_t *sc)
2217{
2218	mxge_cmd_t cmd;
2219	int tx_ring_size, rx_ring_size;
2220	int tx_ring_entries, rx_ring_entries;
2221	int i, err;
2222	unsigned long bytes;
2223
2224	/* get ring sizes */
2225	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2226	tx_ring_size = cmd.data0;
2227	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
2228	if (err != 0) {
2229		device_printf(sc->dev, "Cannot determine ring sizes\n");
2230		goto abort_with_nothing;
2231	}
2232
2233	rx_ring_size = cmd.data0;
2234
2235	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
2236	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
2237	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
2238	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
2239	IFQ_SET_READY(&sc->ifp->if_snd);
2240
2241	sc->tx.mask = tx_ring_entries - 1;
2242	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
2243
2244	err = ENOMEM;
2245
2246	/* allocate the tx request copy block */
2247	bytes = 8 +
2248		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
2249	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2250	if (sc->tx.req_bytes == NULL)
2251		goto abort_with_nothing;
2252	/* ensure req_list entries are aligned to 8 bytes */
2253	sc->tx.req_list = (mcp_kreq_ether_send_t *)
2254		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
2255
2256	/* allocate the tx busdma segment list */
2257	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
2258	sc->tx.seg_list = (bus_dma_segment_t *)
2259		malloc(bytes, M_DEVBUF, M_WAITOK);
2260	if (sc->tx.seg_list == NULL)
2261		goto abort_with_alloc;
2262
2263	/* allocate the rx shadow rings */
2264	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
2265	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2266	if (sc->rx_small.shadow == NULL)
2267		goto abort_with_alloc;
2268
2269	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
2270	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2271	if (sc->rx_big.shadow == NULL)
2272		goto abort_with_alloc;
2273
2274	/* allocate the host info rings */
2275	bytes = tx_ring_entries * sizeof (*sc->tx.info);
2276	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2277	if (sc->tx.info == NULL)
2278		goto abort_with_alloc;
2279
2280	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
2281	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2282	if (sc->rx_small.info == NULL)
2283		goto abort_with_alloc;
2284
2285	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
2286	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2287	if (sc->rx_big.info == NULL)
2288		goto abort_with_alloc;
2289
2290	/* allocate the busdma resources */
2291	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2292				 1,			/* alignment */
2293				 sc->tx.boundary,	/* boundary */
2294				 BUS_SPACE_MAXADDR,	/* low */
2295				 BUS_SPACE_MAXADDR,	/* high */
2296				 NULL, NULL,		/* filter */
2297				 65536 + 256,		/* maxsize */
2298				 MXGE_MAX_SEND_DESC/2,	/* num segs */
2299				 sc->tx.boundary,	/* maxsegsize */
2300				 BUS_DMA_ALLOCNOW,	/* flags */
2301				 NULL, NULL,		/* lock */
2302				 &sc->tx.dmat);		/* tag */
2303
2304	if (err != 0) {
2305		device_printf(sc->dev, "Err %d allocating tx dmat\n",
2306			      err);
2307		goto abort_with_alloc;
2308	}
2309
2310	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2311				 1,			/* alignment */
2312				 4096,			/* boundary */
2313				 BUS_SPACE_MAXADDR,	/* low */
2314				 BUS_SPACE_MAXADDR,	/* high */
2315				 NULL, NULL,		/* filter */
2316				 MHLEN,			/* maxsize */
2317				 1,			/* num segs */
2318				 MHLEN,			/* maxsegsize */
2319				 BUS_DMA_ALLOCNOW,	/* flags */
2320				 NULL, NULL,		/* lock */
2321				 &sc->rx_small.dmat);	/* tag */
2322	if (err != 0) {
2323		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2324			      err);
2325		goto abort_with_alloc;
2326	}
2327
2328	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2329				 1,			/* alignment */
2330				 4096,			/* boundary */
2331				 BUS_SPACE_MAXADDR,	/* low */
2332				 BUS_SPACE_MAXADDR,	/* high */
2333				 NULL, NULL,		/* filter */
2334				 4096,			/* maxsize */
2335				 1,			/* num segs */
2336				 4096,			/* maxsegsize */
2337				 BUS_DMA_ALLOCNOW,	/* flags */
2338				 NULL, NULL,		/* lock */
2339				 &sc->rx_big.dmat);	/* tag */
2340	if (err != 0) {
2341		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2342			      err);
2343		goto abort_with_alloc;
2344	}
2345
2346	/* now use these tags to setup dmamaps for each slot
2347	   in each ring */
2348	for (i = 0; i <= sc->tx.mask; i++) {
2349		err = bus_dmamap_create(sc->tx.dmat, 0,
2350					&sc->tx.info[i].map);
2351		if (err != 0) {
2352			device_printf(sc->dev, "Err %d  tx dmamap\n",
2353			      err);
2354			goto abort_with_alloc;
2355		}
2356	}
2357	for (i = 0; i <= sc->rx_small.mask; i++) {
2358		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2359					&sc->rx_small.info[i].map);
2360		if (err != 0) {
2361			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2362				      err);
2363			goto abort_with_alloc;
2364		}
2365	}
2366	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2367				&sc->rx_small.extra_map);
2368	if (err != 0) {
2369		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2370			      err);
2371			goto abort_with_alloc;
2372	}
2373
2374	for (i = 0; i <= sc->rx_big.mask; i++) {
2375		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2376					&sc->rx_big.info[i].map);
2377		if (err != 0) {
2378			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2379			      err);
2380			goto abort_with_alloc;
2381		}
2382	}
2383	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2384				&sc->rx_big.extra_map);
2385	if (err != 0) {
2386		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2387			      err);
2388			goto abort_with_alloc;
2389	}
2390	return 0;
2391
2392abort_with_alloc:
2393	mxge_free_rings(sc);
2394
2395abort_with_nothing:
2396	return err;
2397}
2398
2399static int
2400mxge_open(mxge_softc_t *sc)
2401{
2402	mxge_cmd_t cmd;
2403	int i, err;
2404	bus_dmamap_t map;
2405	bus_addr_t bus;
2406
2407
2408	/* Copy the MAC address in case it was overridden */
2409	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
2410
2411	err = mxge_reset(sc);
2412	if (err != 0) {
2413		device_printf(sc->dev, "failed to reset\n");
2414		return EIO;
2415	}
2416	bzero(sc->rx_done.entry,
2417	      mxge_max_intr_slots * sizeof(*sc->rx_done.entry));
2418
2419	if (MCLBYTES >=
2420	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
2421		sc->big_bytes = MCLBYTES;
2422	else
2423		sc->big_bytes = MJUMPAGESIZE;
2424
2425
2426	/* get the lanai pointers to the send and receive rings */
2427
2428	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2429	sc->tx.lanai =
2430		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2431	err |= mxge_send_cmd(sc,
2432				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2433	sc->rx_small.lanai =
2434		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2435	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2436	sc->rx_big.lanai =
2437		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2438
2439	if (err != 0) {
2440		device_printf(sc->dev,
2441			      "failed to get ring sizes or locations\n");
2442		return EIO;
2443	}
2444
2445	if (sc->wc) {
2446		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
2447		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
2448		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
2449	} else {
2450		sc->tx.wc_fifo = 0;
2451		sc->rx_small.wc_fifo = 0;
2452		sc->rx_big.wc_fifo = 0;
2453	}
2454
2455
2456	/* stock receive rings */
2457	for (i = 0; i <= sc->rx_small.mask; i++) {
2458		map = sc->rx_small.info[i].map;
2459		err = mxge_get_buf_small(sc, map, i);
2460		if (err) {
2461			device_printf(sc->dev, "alloced %d/%d smalls\n",
2462				      i, sc->rx_small.mask + 1);
2463			goto abort;
2464		}
2465	}
2466	for (i = 0; i <= sc->rx_big.mask; i++) {
2467		map = sc->rx_big.info[i].map;
2468		err = mxge_get_buf_big(sc, map, i);
2469		if (err) {
2470			device_printf(sc->dev, "alloced %d/%d bigs\n",
2471				      i, sc->rx_big.mask + 1);
2472			goto abort;
2473		}
2474	}
2475
2476	/* Give the firmware the mtu and the big and small buffer
2477	   sizes.  The firmware wants the big buf size to be a power
2478	   of two. Luckily, FreeBSD's clusters are powers of two */
2479	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
2480	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2481	cmd.data0 = MHLEN - MXGEFW_PAD;
2482	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2483			     &cmd);
2484	cmd.data0 = sc->big_bytes;
2485	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2486
2487	if (err != 0) {
2488		device_printf(sc->dev, "failed to setup params\n");
2489		goto abort;
2490	}
2491
2492	/* Now give him the pointer to the stats block */
2493	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
2494	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
2495	cmd.data2 = sizeof(struct mcp_irq_data);
2496	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
2497
2498	if (err != 0) {
2499		bus = sc->fw_stats_dma.bus_addr;
2500		bus += offsetof(struct mcp_irq_data, send_done_count);
2501		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
2502		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
2503		err = mxge_send_cmd(sc,
2504				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
2505				    &cmd);
2506		/* Firmware cannot support multicast without STATS_DMA_V2 */
2507		sc->fw_multicast_support = 0;
2508	} else {
2509		sc->fw_multicast_support = 1;
2510	}
2511
2512	if (err != 0) {
2513		device_printf(sc->dev, "failed to setup params\n");
2514		goto abort;
2515	}
2516
2517	/* Finally, start the firmware running */
2518	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
2519	if (err) {
2520		device_printf(sc->dev, "Couldn't bring up link\n");
2521		goto abort;
2522	}
2523	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
2524	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2525
2526	return 0;
2527
2528
2529abort:
2530	mxge_free_mbufs(sc);
2531
2532	return err;
2533}
2534
2535static int
2536mxge_close(mxge_softc_t *sc)
2537{
2538	mxge_cmd_t cmd;
2539	int err, old_down_cnt;
2540
2541	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2542	old_down_cnt = sc->down_cnt;
2543	mb();
2544	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2545	if (err) {
2546		device_printf(sc->dev, "Couldn't bring down link\n");
2547	}
2548	if (old_down_cnt == sc->down_cnt) {
2549		/* wait for down irq */
2550		DELAY(10 * sc->intr_coal_delay);
2551	}
2552	if (old_down_cnt == sc->down_cnt) {
2553		device_printf(sc->dev, "never got down irq\n");
2554	}
2555
2556	mxge_free_mbufs(sc);
2557
2558	return 0;
2559}
2560
2561static void
2562mxge_setup_cfg_space(mxge_softc_t *sc)
2563{
2564	device_t dev = sc->dev;
2565	int reg;
2566	uint16_t cmd, lnk, pectl;
2567
2568	/* find the PCIe link width and set max read request to 4KB*/
2569	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
2570		lnk = pci_read_config(dev, reg + 0x12, 2);
2571		sc->link_width = (lnk >> 4) & 0x3f;
2572
2573		pectl = pci_read_config(dev, reg + 0x8, 2);
2574		pectl = (pectl & ~0x7000) | (5 << 12);
2575		pci_write_config(dev, reg + 0x8, pectl, 2);
2576	}
2577
2578	/* Enable DMA and Memory space access */
2579	pci_enable_busmaster(dev);
2580	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2581	cmd |= PCIM_CMD_MEMEN;
2582	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2583}
2584
2585static uint32_t
2586mxge_read_reboot(mxge_softc_t *sc)
2587{
2588	device_t dev = sc->dev;
2589	uint32_t vs;
2590
2591	/* find the vendor specific offset */
2592	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
2593		device_printf(sc->dev,
2594			      "could not find vendor specific offset\n");
2595		return (uint32_t)-1;
2596	}
2597	/* enable read32 mode */
2598	pci_write_config(dev, vs + 0x10, 0x3, 1);
2599	/* tell NIC which register to read */
2600	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
2601	return (pci_read_config(dev, vs + 0x14, 4));
2602}
2603
2604static void
2605mxge_watchdog_reset(mxge_softc_t *sc)
2606{
2607	int err;
2608	uint32_t reboot;
2609	uint16_t cmd;
2610
2611	err = ENXIO;
2612
2613	device_printf(sc->dev, "Watchdog reset!\n");
2614
2615	/*
2616	 * check to see if the NIC rebooted.  If it did, then all of
2617	 * PCI config space has been reset, and things like the
2618	 * busmaster bit will be zero.  If this is the case, then we
2619	 * must restore PCI config space before the NIC can be used
2620	 * again
2621	 */
2622	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2623	if (cmd == 0xffff) {
2624		/*
2625		 * maybe the watchdog caught the NIC rebooting; wait
2626		 * up to 100ms for it to finish.  If it does not come
2627		 * back, then give up
2628		 */
2629		DELAY(1000*100);
2630		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2631		if (cmd == 0xffff) {
2632			device_printf(sc->dev, "NIC disappeared!\n");
2633			goto abort;
2634		}
2635	}
2636	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
2637		/* print the reboot status */
2638		reboot = mxge_read_reboot(sc);
2639		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
2640			      reboot);
2641		/* restore PCI configuration space */
2642
2643		/* XXXX waiting for pci_cfg_restore() to be exported */
2644		goto abort; /* just abort for now */
2645
2646		/* and redo any changes we made to our config space */
2647		mxge_setup_cfg_space(sc);
2648	} else {
2649		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
2650		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
2651			      sc->tx.req, sc->tx.done);
2652		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
2653			      sc->tx.pkt_done,
2654			      be32toh(sc->fw_stats->send_done_count));
2655	}
2656
2657	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
2658		mxge_close(sc);
2659		err = mxge_open(sc);
2660	}
2661
2662abort:
2663	/*
2664	 * stop the watchdog if the nic is dead, to avoid spamming the
2665	 * console
2666	 */
2667	if (err != 0) {
2668		callout_stop(&sc->co_hdl);
2669	}
2670}
2671
2672static void
2673mxge_watchdog(mxge_softc_t *sc)
2674{
2675	mxge_tx_buf_t *tx = &sc->tx;
2676
2677	/* see if we have outstanding transmits, which
2678	   have been pending for more than mxge_ticks */
2679	if (tx->req != tx->done &&
2680	    tx->watchdog_req != tx->watchdog_done &&
2681	    tx->done == tx->watchdog_done)
2682		mxge_watchdog_reset(sc);
2683
2684	tx->watchdog_req = tx->req;
2685	tx->watchdog_done = tx->done;
2686}
2687
2688static void
2689mxge_tick(void *arg)
2690{
2691	mxge_softc_t *sc = arg;
2692
2693
2694	/* Synchronize with possible callout reset/stop. */
2695	if (callout_pending(&sc->co_hdl) ||
2696	    !callout_active(&sc->co_hdl)) {
2697		mtx_unlock(&sc->driver_mtx);
2698		return;
2699	}
2700
2701	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2702	mxge_watchdog(sc);
2703}
2704
2705static int
2706mxge_media_change(struct ifnet *ifp)
2707{
2708	return EINVAL;
2709}
2710
2711static int
2712mxge_change_mtu(mxge_softc_t *sc, int mtu)
2713{
2714	struct ifnet *ifp = sc->ifp;
2715	int real_mtu, old_mtu;
2716	int err = 0;
2717
2718
2719	real_mtu = mtu + ETHER_HDR_LEN;
2720	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
2721	    real_mtu < 60)
2722		return EINVAL;
2723	mtx_lock(&sc->driver_mtx);
2724	old_mtu = ifp->if_mtu;
2725	ifp->if_mtu = mtu;
2726	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2727		callout_stop(&sc->co_hdl);
2728		mxge_close(sc);
2729		err = mxge_open(sc);
2730		if (err != 0) {
2731			ifp->if_mtu = old_mtu;
2732			mxge_close(sc);
2733			(void) mxge_open(sc);
2734		}
2735		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2736	}
2737	mtx_unlock(&sc->driver_mtx);
2738	return err;
2739}
2740
2741static void
2742mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2743{
2744	mxge_softc_t *sc = ifp->if_softc;
2745
2746
2747	if (sc == NULL)
2748		return;
2749	ifmr->ifm_status = IFM_AVALID;
2750	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2751	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2752	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2753}
2754
2755static int
2756mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2757{
2758	mxge_softc_t *sc = ifp->if_softc;
2759	struct ifreq *ifr = (struct ifreq *)data;
2760	int err, mask;
2761
2762	err = 0;
2763	switch (command) {
2764	case SIOCSIFADDR:
2765	case SIOCGIFADDR:
2766		err = ether_ioctl(ifp, command, data);
2767		break;
2768
2769	case SIOCSIFMTU:
2770		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2771		break;
2772
2773	case SIOCSIFFLAGS:
2774		mtx_lock(&sc->driver_mtx);
2775		if (ifp->if_flags & IFF_UP) {
2776			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
2777				err = mxge_open(sc);
2778				callout_reset(&sc->co_hdl, mxge_ticks,
2779					      mxge_tick, sc);
2780			} else {
2781				/* take care of promis can allmulti
2782				   flag chages */
2783				mxge_change_promisc(sc,
2784						    ifp->if_flags & IFF_PROMISC);
2785				mxge_set_multicast_list(sc);
2786			}
2787		} else {
2788			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2789				mxge_close(sc);
2790				callout_stop(&sc->co_hdl);
2791			}
2792		}
2793		mtx_unlock(&sc->driver_mtx);
2794		break;
2795
2796	case SIOCADDMULTI:
2797	case SIOCDELMULTI:
2798		mtx_lock(&sc->driver_mtx);
2799		mxge_set_multicast_list(sc);
2800		mtx_unlock(&sc->driver_mtx);
2801		break;
2802
2803	case SIOCSIFCAP:
2804		mtx_lock(&sc->driver_mtx);
2805		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2806		if (mask & IFCAP_TXCSUM) {
2807			if (IFCAP_TXCSUM & ifp->if_capenable) {
2808				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
2809				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
2810						      | CSUM_TSO);
2811			} else {
2812				ifp->if_capenable |= IFCAP_TXCSUM;
2813				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2814			}
2815		} else if (mask & IFCAP_RXCSUM) {
2816			if (IFCAP_RXCSUM & ifp->if_capenable) {
2817				ifp->if_capenable &= ~IFCAP_RXCSUM;
2818				sc->csum_flag = 0;
2819			} else {
2820				ifp->if_capenable |= IFCAP_RXCSUM;
2821				sc->csum_flag = 1;
2822			}
2823		}
2824		if (mask & IFCAP_TSO4) {
2825			if (IFCAP_TSO4 & ifp->if_capenable) {
2826				ifp->if_capenable &= ~IFCAP_TSO4;
2827				ifp->if_hwassist &= ~CSUM_TSO;
2828			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
2829				ifp->if_capenable |= IFCAP_TSO4;
2830				ifp->if_hwassist |= CSUM_TSO;
2831			} else {
2832				printf("mxge requires tx checksum offload"
2833				       " be enabled to use TSO\n");
2834				err = EINVAL;
2835			}
2836		}
2837		mtx_unlock(&sc->driver_mtx);
2838		break;
2839
2840	case SIOCGIFMEDIA:
2841		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2842				    &sc->media, command);
2843                break;
2844
2845	default:
2846		err = ENOTTY;
2847        }
2848	return err;
2849}
2850
2851static void
2852mxge_fetch_tunables(mxge_softc_t *sc)
2853{
2854
2855	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2856			  &mxge_flow_control);
2857	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2858			  &mxge_intr_coal_delay);
2859	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2860			  &mxge_nvidia_ecrc_enable);
2861	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
2862			  &mxge_force_firmware);
2863	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2864			  &mxge_deassert_wait);
2865	TUNABLE_INT_FETCH("hw.mxge.verbose",
2866			  &mxge_verbose);
2867	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
2868
2869	if (bootverbose)
2870		mxge_verbose = 1;
2871	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2872		mxge_intr_coal_delay = 30;
2873	if (mxge_ticks == 0)
2874		mxge_ticks = hz;
2875	sc->pause = mxge_flow_control;
2876}
2877
2878static int
2879mxge_attach(device_t dev)
2880{
2881	mxge_softc_t *sc = device_get_softc(dev);
2882	struct ifnet *ifp;
2883	size_t bytes;
2884	int count, rid, err;
2885
2886	sc->dev = dev;
2887	mxge_fetch_tunables(sc);
2888
2889	err = bus_dma_tag_create(NULL,			/* parent */
2890				 1,			/* alignment */
2891				 4096,			/* boundary */
2892				 BUS_SPACE_MAXADDR,	/* low */
2893				 BUS_SPACE_MAXADDR,	/* high */
2894				 NULL, NULL,		/* filter */
2895				 65536 + 256,		/* maxsize */
2896				 MXGE_MAX_SEND_DESC, 	/* num segs */
2897				 4096,			/* maxsegsize */
2898				 0,			/* flags */
2899				 NULL, NULL,		/* lock */
2900				 &sc->parent_dmat);	/* tag */
2901
2902	if (err != 0) {
2903		device_printf(sc->dev, "Err %d allocating parent dmat\n",
2904			      err);
2905		goto abort_with_nothing;
2906	}
2907
2908	ifp = sc->ifp = if_alloc(IFT_ETHER);
2909	if (ifp == NULL) {
2910		device_printf(dev, "can not if_alloc()\n");
2911		err = ENOSPC;
2912		goto abort_with_parent_dmat;
2913	}
2914	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
2915		 device_get_nameunit(dev));
2916	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
2917	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
2918		 device_get_nameunit(dev));
2919	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
2920	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
2921		 "%s:drv", device_get_nameunit(dev));
2922	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
2923		 MTX_NETWORK_LOCK, MTX_DEF);
2924
2925	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
2926
2927	mxge_setup_cfg_space(sc);
2928
2929	/* Map the board into the kernel */
2930	rid = PCIR_BARS;
2931	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
2932					 ~0, 1, RF_ACTIVE);
2933	if (sc->mem_res == NULL) {
2934		device_printf(dev, "could not map memory\n");
2935		err = ENXIO;
2936		goto abort_with_lock;
2937	}
2938	sc->sram = rman_get_virtual(sc->mem_res);
2939	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
2940	if (sc->sram_size > rman_get_size(sc->mem_res)) {
2941		device_printf(dev, "impossible memory region size %ld\n",
2942			      rman_get_size(sc->mem_res));
2943		err = ENXIO;
2944		goto abort_with_mem_res;
2945	}
2946
2947	/* make NULL terminated copy of the EEPROM strings section of
2948	   lanai SRAM */
2949	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
2950	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
2951				rman_get_bushandle(sc->mem_res),
2952				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
2953				sc->eeprom_strings,
2954				MXGE_EEPROM_STRINGS_SIZE - 2);
2955	err = mxge_parse_strings(sc);
2956	if (err != 0)
2957		goto abort_with_mem_res;
2958
2959	/* Enable write combining for efficient use of PCIe bus */
2960	mxge_enable_wc(sc);
2961
2962	/* Allocate the out of band dma memory */
2963	err = mxge_dma_alloc(sc, &sc->cmd_dma,
2964			     sizeof (mxge_cmd_t), 64);
2965	if (err != 0)
2966		goto abort_with_mem_res;
2967	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
2968	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
2969	if (err != 0)
2970		goto abort_with_cmd_dma;
2971
2972	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
2973			     sizeof (*sc->fw_stats), 64);
2974	if (err != 0)
2975		goto abort_with_zeropad_dma;
2976	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
2977
2978	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
2979	if (err != 0)
2980		goto abort_with_fw_stats;
2981
2982	/* allocate interrupt queues */
2983	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
2984	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2985	if (err != 0)
2986		goto abort_with_dmabench;
2987	sc->rx_done.entry = sc->rx_done.dma.addr;
2988	bzero(sc->rx_done.entry, bytes);
2989
2990	/* Add our ithread  */
2991	count = pci_msi_count(dev);
2992	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
2993		rid = 1;
2994		sc->msi_enabled = 1;
2995	} else {
2996		rid = 0;
2997	}
2998	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
2999					 1, RF_SHAREABLE | RF_ACTIVE);
3000	if (sc->irq_res == NULL) {
3001		device_printf(dev, "could not alloc interrupt\n");
3002		goto abort_with_rx_done;
3003	}
3004	if (mxge_verbose)
3005		device_printf(dev, "using %s irq %ld\n",
3006			      sc->msi_enabled ? "MSI" : "INTx",
3007			      rman_get_start(sc->irq_res));
3008	/* load the firmware */
3009	mxge_select_firmware(sc);
3010
3011	err = mxge_load_firmware(sc);
3012	if (err != 0)
3013		goto abort_with_irq_res;
3014	sc->intr_coal_delay = mxge_intr_coal_delay;
3015	err = mxge_reset(sc);
3016	if (err != 0)
3017		goto abort_with_irq_res;
3018
3019	err = mxge_alloc_rings(sc);
3020	if (err != 0) {
3021		device_printf(sc->dev, "failed to allocate rings\n");
3022		goto abort_with_irq_res;
3023	}
3024
3025	err = bus_setup_intr(sc->dev, sc->irq_res,
3026			     INTR_TYPE_NET | INTR_MPSAFE,
3027			     mxge_intr, sc, &sc->ih);
3028	if (err != 0) {
3029		goto abort_with_rings;
3030	}
3031	/* hook into the network stack */
3032	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3033	ifp->if_baudrate = 100000000;
3034	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3035		IFCAP_JUMBO_MTU;
3036	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3037	ifp->if_capenable = ifp->if_capabilities;
3038	sc->csum_flag = 1;
3039        ifp->if_init = mxge_init;
3040        ifp->if_softc = sc;
3041        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3042        ifp->if_ioctl = mxge_ioctl;
3043        ifp->if_start = mxge_start;
3044	ether_ifattach(ifp, sc->mac_addr);
3045	/* ether_ifattach sets mtu to 1500 */
3046	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
3047
3048	/* Initialise the ifmedia structure */
3049	ifmedia_init(&sc->media, 0, mxge_media_change,
3050		     mxge_media_status);
3051	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3052	mxge_add_sysctls(sc);
3053	return 0;
3054
3055abort_with_rings:
3056	mxge_free_rings(sc);
3057abort_with_irq_res:
3058	bus_release_resource(dev, SYS_RES_IRQ,
3059			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3060	if (sc->msi_enabled)
3061		pci_release_msi(dev);
3062abort_with_rx_done:
3063	sc->rx_done.entry = NULL;
3064	mxge_dma_free(&sc->rx_done.dma);
3065abort_with_dmabench:
3066	mxge_dma_free(&sc->dmabench_dma);
3067abort_with_fw_stats:
3068	mxge_dma_free(&sc->fw_stats_dma);
3069abort_with_zeropad_dma:
3070	mxge_dma_free(&sc->zeropad_dma);
3071abort_with_cmd_dma:
3072	mxge_dma_free(&sc->cmd_dma);
3073abort_with_mem_res:
3074	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3075abort_with_lock:
3076	pci_disable_busmaster(dev);
3077	mtx_destroy(&sc->cmd_mtx);
3078	mtx_destroy(&sc->tx_mtx);
3079	mtx_destroy(&sc->driver_mtx);
3080	if_free(ifp);
3081abort_with_parent_dmat:
3082	bus_dma_tag_destroy(sc->parent_dmat);
3083
3084abort_with_nothing:
3085	return err;
3086}
3087
3088static int
3089mxge_detach(device_t dev)
3090{
3091	mxge_softc_t *sc = device_get_softc(dev);
3092
3093	mtx_lock(&sc->driver_mtx);
3094	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3095		mxge_close(sc);
3096	callout_stop(&sc->co_hdl);
3097	mtx_unlock(&sc->driver_mtx);
3098	ether_ifdetach(sc->ifp);
3099	ifmedia_removeall(&sc->media);
3100	mxge_dummy_rdma(sc, 0);
3101	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
3102	mxge_free_rings(sc);
3103	bus_release_resource(dev, SYS_RES_IRQ,
3104			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3105	if (sc->msi_enabled)
3106		pci_release_msi(dev);
3107
3108	sc->rx_done.entry = NULL;
3109	mxge_dma_free(&sc->rx_done.dma);
3110	mxge_dma_free(&sc->fw_stats_dma);
3111	mxge_dma_free(&sc->dmabench_dma);
3112	mxge_dma_free(&sc->zeropad_dma);
3113	mxge_dma_free(&sc->cmd_dma);
3114	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3115	pci_disable_busmaster(dev);
3116	mtx_destroy(&sc->cmd_mtx);
3117	mtx_destroy(&sc->tx_mtx);
3118	mtx_destroy(&sc->driver_mtx);
3119	if_free(sc->ifp);
3120	bus_dma_tag_destroy(sc->parent_dmat);
3121	return 0;
3122}
3123
3124static int
3125mxge_shutdown(device_t dev)
3126{
3127	return 0;
3128}
3129
3130/*
3131  This file uses Myri10GE driver indentation.
3132
3133  Local Variables:
3134  c-file-style:"linux"
3135  tab-width:8
3136  End:
3137*/
3138