if_mxge.c revision 198250
/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 198250 2009-10-19 20:51:27Z gallatin $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

/* count xmits ourselves, rather than via drbr */
#define NO_SLOW_STATS
#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;


	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
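
/*
 * Illustrative usage sketch (added commentary, not part of the driver):
 * callers pair mxge_dma_alloc() with mxge_dma_free() and hand the
 * device-visible address to the firmware in two 32-bit halves, e.g.:
 *
 *	mxge_dma_t dma;
 *
 *	if (mxge_dma_alloc(sc, &dma, 4096, 64) == 0) {
 *		cmd.data0 = MXGE_LOWPART_TO_U32(dma.bus_addr);
 *		cmd.data1 = MXGE_HIGHPART_TO_U32(dma.bus_addr);
 *		...
 *		mxge_dma_free(&dma);
 *	}
 */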


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
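
/*
 * Worked example (hypothetical values, added commentary): if the EEPROM
 * holds "SN=123456\0MAC=00:60:dd:00:11:22\0PC=CODE\0\0", the parser above
 * leaves mac_addr[] = {0x00, 0x60, 0xdd, 0x00, 0x11, 0x22}, copies
 * "123456" into serial_number_string and "CODE" into product_code_string.
 */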

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
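
/*
 * Worked example (illustrative numbers, added commentary): with
 * len = 4096 and a reply of cmd.data0 = 0x0c802710, the upper half
 * (0x0c80 = 3200) says 3200 transfers completed and the lower half
 * (0x2710 = 10000) says they took 10000 half-microsecond ticks (5ms),
 * so read_dma = (3200 * 4096 * 2) / 10000 ~= 2621 MB/s; the factor of
 * 2 cancels the 0.5us tick so bytes-per-microsecond equals MB/s.
 */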

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}
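
/*
 * Added summary (not in the original source): firmware selection above
 * reduces to three cases.  If mxge_force_firmware (or throttling) is
 * set, the forced choice is trusted.  If the PCIe link is x4 or
 * narrower, the aligned image is used without probing (with a warning
 * about reduced performance).  Otherwise mxge_firmware_probe() decides
 * empirically, and on failure the driver falls back to the unaligned
 * "ethp" image with a 2KB tx_boundary.
 */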

union qualhack
{
	const char *ro_char;
	char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
	void *ptr;

	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
	return ptr;
}

static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}

	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}
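
/*
 * Illustrative round trip (added commentary, not driver code): callers
 * zero the command block, issue the command, and read any 32-bit reply
 * back from data0, exactly as mxge_reset() does below:
 *
 *	mxge_cmd_t cmd;
 *	int err;
 *
 *	memset(&cmd, 0, sizeof (cmd));
 *	err = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
 *	if (err == 0)
 *		... cmd.data0 holds the byte-swapped response ...
 */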

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0) {
		return err;
	}

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc, 0);
		err = mxge_open(sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}
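
/*
 * Added note (not in the original source): the firmware statistics block
 * is kept in network byte order, so counters such as fw->dropped_pause
 * cannot be exported with SYSCTL_ADD_INT directly.  mxge_handle_be32()
 * byte-swaps the value into arg2 and passes a NULL arg1 to
 * sysctl_handle_int(), which then reports the swapped copy read-only.
 */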

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control (pause frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}
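
/*
 * Added note (not in the original source): the chain is published in two
 * steps.  All descriptors are PIO-copied with the first segment's flags
 * forced to zero, so the NIC treats the chain as not yet valid; the
 * final 4-byte store of the saved flags word then makes the entire
 * request visible at once, which is why only that last dword of the
 * first descriptor is re-written after the barrier.
 */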

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  ss->scratch);
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
1880	 * in the middle, this is the number of RDMAs starting
1881	 * after the last cut in the request. All previous
1882	 * segments before the last cut implicitly have 1 RDMA.
1883	 *
1884	 * Since the number of RDMAs is not known beforehand,
1885	 * it must be filled-in retroactively - after each
1886	 * segmentation cut or at the end of the entire packet.
1887	 */
1888
1889	while (busdma_seg_cnt) {
1890		/* Break the busdma segment up into pieces*/
1891		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1892		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1893		len = seg->ds_len;
1894
1895		while (len) {
1896			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1897			seglen = len;
1898			cum_len_next = cum_len + seglen;
1899			(req-rdma_count)->rdma_count = rdma_count + 1;
1900			if (__predict_true(cum_len >= 0)) {
1901				/* payload */
1902				chop = (cum_len_next > mss);
1903				cum_len_next = cum_len_next % mss;
1904				next_is_first = (cum_len_next == 0);
1905				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1906				flags_next |= next_is_first *
1907					MXGEFW_FLAGS_FIRST;
1908				rdma_count |= -(chop | next_is_first);
1909				rdma_count += chop & !next_is_first;
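				/* The two lines above do branchless
				 * bookkeeping: -(chop | next_is_first)
				 * is all-ones when either is set, so
				 * rdma_count is forced to -1 at every
				 * segment cut; chop & !next_is_first
				 * then counts the current descriptor
				 * toward the new frame when the cut
				 * falls mid-descriptor. */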
1910			} else if (cum_len_next >= 0) {
1911				/* header ends */
1912				rdma_count = -1;
1913				cum_len_next = 0;
1914				seglen = -cum_len;
1915				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1916				flags_next = MXGEFW_FLAGS_TSO_PLD |
1917					MXGEFW_FLAGS_FIRST |
1918					(small * MXGEFW_FLAGS_SMALL);
1919			}
1920
1921			req->addr_high = high_swapped;
1922			req->addr_low = htobe32(low);
1923			req->pseudo_hdr_offset = pseudo_hdr_offset;
1924			req->pad = 0;
1925			req->rdma_count = 1;
1926			req->length = htobe16(seglen);
1927			req->cksum_offset = cksum_offset;
1928			req->flags = flags | ((cum_len & 1) *
1929					      MXGEFW_FLAGS_ALIGN_ODD);
1930			low += seglen;
1931			len -= seglen;
1932			cum_len = cum_len_next;
1933			flags = flags_next;
1934			req++;
1935			cnt++;
1936			rdma_count++;
1937			if (__predict_false(cksum_offset > seglen))
1938				cksum_offset -= seglen;
1939			else
1940				cksum_offset = 0;
1941			if (__predict_false(cnt > tx->max_desc))
1942				goto drop;
1943		}
1944		busdma_seg_cnt--;
1945		seg++;
1946	}
1947	(req-rdma_count)->rdma_count = rdma_count;
1948
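	/* walk backwards from the final descriptor, tagging each one
	 * with TSO_LAST until the last frame's boundary (a CHOP or
	 * FIRST descriptor) has been marked */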
1949	do {
1950		req--;
1951		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1952	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1953
1954	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1955	mxge_submit_req(tx, tx->req_list, cnt);
1956#ifdef IFNET_BUF_RING
1957	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1958		/* tell the NIC to start polling this slice */
1959		*tx->send_go = 1;
1960		tx->queue_active = 1;
1961		tx->activate++;
1962		wmb();
1963	}
1964#endif
1965	return;
1966
1967drop:
1968	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1969	m_freem(m);
1970	ss->oerrors++;
1971	if (!once) {
1972		printf("tx->max_desc exceeded via TSO!\n");
1973		printf("mss = %d, %ld, %d!\n", mss,
1974		       (long)seg - (long)tx->seg_list, tx->max_desc);
1975		once = 1;
1976	}
1977	return;
1978
1979}
1980
1981#endif /* IFCAP_TSO4 */
1982
1983#ifdef MXGE_NEW_VLAN_API
1984/*
1985 * We reproduce the software vlan tag insertion from
1986 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1987 * vlan tag insertion. We need to advertise this in order to have the
1988 * vlan interface respect our csum offload flags.
1989 */
1990static struct mbuf *
1991mxge_vlan_tag_insert(struct mbuf *m)
1992{
1993	struct ether_vlan_header *evl;
1994
1995	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1996	if (__predict_false(m == NULL))
1997		return NULL;
1998	if (m->m_len < sizeof(*evl)) {
1999		m = m_pullup(m, sizeof(*evl));
2000		if (__predict_false(m == NULL))
2001			return NULL;
2002	}
2003	/*
2004	 * Transform the Ethernet header into an Ethernet header
2005	 * with 802.1Q encapsulation.
2006	 */
2007	evl = mtod(m, struct ether_vlan_header *);
2008	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2009	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2010	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2011	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2012	m->m_flags &= ~M_VLANTAG;
2013	return m;
2014}
2015#endif /* MXGE_NEW_VLAN_API */
2016
2017static void
2018mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2019{
2020	mxge_softc_t *sc;
2021	mcp_kreq_ether_send_t *req;
2022	bus_dma_segment_t *seg;
2023	struct mbuf *m_tmp;
2024	struct ifnet *ifp;
2025	mxge_tx_ring_t *tx;
2026	struct ip *ip;
2027	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2028	uint16_t pseudo_hdr_offset;
2029	uint8_t flags, cksum_offset;
2030
2031
2032	sc = ss->sc;
2033	ifp = sc->ifp;
2034	tx = &ss->tx;
2035
2036	ip_off = sizeof (struct ether_header);
2037#ifdef MXGE_NEW_VLAN_API
2038	if (m->m_flags & M_VLANTAG) {
2039		m = mxge_vlan_tag_insert(m);
2040		if (__predict_false(m == NULL))
2041			goto drop;
2042		ip_off += ETHER_VLAN_ENCAP_LEN;
2043	}
2044#endif
2045	/* (try to) map the frame for DMA */
2046	idx = tx->req & tx->mask;
2047	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2048				      m, tx->seg_list, &cnt,
2049				      BUS_DMA_NOWAIT);
2050	if (__predict_false(err == EFBIG)) {
2051		/* Too many segments in the chain.  Try
2052		   to defrag */
2053		m_tmp = m_defrag(m, M_NOWAIT);
2054		if (m_tmp == NULL) {
2055			goto drop;
2056		}
2057		ss->tx.defrag++;
2058		m = m_tmp;
2059		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2060					      tx->info[idx].map,
2061					      m, tx->seg_list, &cnt,
2062					      BUS_DMA_NOWAIT);
2063	}
2064	if (__predict_false(err != 0)) {
2065		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2066			      " packet len = %d\n", err, m->m_pkthdr.len);
2067		goto drop;
2068	}
2069	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2070			BUS_DMASYNC_PREWRITE);
2071	tx->info[idx].m = m;
2072
2073#if IFCAP_TSO4
2074	/* TSO is different enough, we handle it in another routine */
2075	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2076		mxge_encap_tso(ss, m, cnt, ip_off);
2077		return;
2078	}
2079#endif
2080
2081	req = tx->req_list;
2082	cksum_offset = 0;
2083	pseudo_hdr_offset = 0;
2084	flags = MXGEFW_FLAGS_NO_TSO;
2085
2086	/* checksum offloading? */
2087	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2088		/* ensure ip header is in first mbuf, copy
2089		   it to a scratch buffer if not */
2090		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2091			m_copydata(m, 0, ip_off + sizeof (*ip),
2092				   ss->scratch);
2093			ip = (struct ip *)(ss->scratch + ip_off);
2094		} else {
2095			ip = (struct ip *)(mtod(m, char *) + ip_off);
2096		}
2097		cksum_offset = ip_off + (ip->ip_hl << 2);
2098		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2099		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2100		req->cksum_offset = cksum_offset;
2101		flags |= MXGEFW_FLAGS_CKSUM;
2102		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2103	} else {
2104		odd_flag = 0;
2105	}
2106	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2107		flags |= MXGEFW_FLAGS_SMALL;
2108
2109	/* convert segments into a request list */
2110	cum_len = 0;
2111	seg = tx->seg_list;
2112	req->flags = MXGEFW_FLAGS_FIRST;
2113	for (i = 0; i < cnt; i++) {
2114		req->addr_low =
2115			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2116		req->addr_high =
2117			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2118		req->length = htobe16(seg->ds_len);
2119		req->cksum_offset = cksum_offset;
2120		if (cksum_offset > seg->ds_len)
2121			cksum_offset -= seg->ds_len;
2122		else
2123			cksum_offset = 0;
2124		req->pseudo_hdr_offset = pseudo_hdr_offset;
2125		req->pad = 0; /* complete solid 16-byte block */
2126		req->rdma_count = 1;
2127		req->flags |= flags | ((cum_len & 1) * odd_flag);
2128		cum_len += seg->ds_len;
2129		seg++;
2130		req++;
2131		req->flags = 0;
2132	}
2133	req--;
2134	/* pad runts to 60 bytes */
2135	if (cum_len < 60) {
2136		req++;
2137		req->addr_low =
2138			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2139		req->addr_high =
2140			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2141		req->length = htobe16(60 - cum_len);
2142		req->cksum_offset = 0;
2143		req->pseudo_hdr_offset = pseudo_hdr_offset;
2144		req->pad = 0; /* complete solid 16-byte block */
2145		req->rdma_count = 1;
2146		req->flags |= flags | ((cum_len & 1) * odd_flag);
2147		cnt++;
2148	}
2149
2150	tx->req_list[0].rdma_count = cnt;
2151#if 0
2152	/* print what the firmware will see */
2153	for (i = 0; i < cnt; i++) {
2154		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2155		    "cso:%d, flags:0x%x, rdma:%d\n",
2156		    i, (int)ntohl(tx->req_list[i].addr_high),
2157		    (int)ntohl(tx->req_list[i].addr_low),
2158		    (int)ntohs(tx->req_list[i].length),
2159		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2160		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2161		    tx->req_list[i].rdma_count);
2162	}
2163	printf("--------------\n");
2164#endif
2165	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2166	mxge_submit_req(tx, tx->req_list, cnt);
2167#ifdef IFNET_BUF_RING
2168	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2169		/* tell the NIC to start polling this slice */
2170		*tx->send_go = 1;
2171		tx->queue_active = 1;
2172		tx->activate++;
2173		wmb();
2174	}
2175#endif
2176	return;
2177
2178drop:
2179	m_freem(m);
2180	ss->oerrors++;
2181	return;
2182}
2183
2184#ifdef IFNET_BUF_RING
2185static void
2186mxge_qflush(struct ifnet *ifp)
2187{
2188	mxge_softc_t *sc = ifp->if_softc;
2189	mxge_tx_ring_t *tx;
2190	struct mbuf *m;
2191	int slice;
2192
2193	for (slice = 0; slice < sc->num_slices; slice++) {
2194		tx = &sc->ss[slice].tx;
2195		mtx_lock(&tx->mtx);
2196		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2197			m_freem(m);
2198		mtx_unlock(&tx->mtx);
2199	}
2200	if_qflush(ifp);
2201}
2202
2203static inline void
2204mxge_start_locked(struct mxge_slice_state *ss)
2205{
2206	mxge_softc_t *sc;
2207	struct mbuf *m;
2208	struct ifnet *ifp;
2209	mxge_tx_ring_t *tx;
2210
2211	sc = ss->sc;
2212	ifp = sc->ifp;
2213	tx = &ss->tx;
2214
2215	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2216		m = drbr_dequeue(ifp, tx->br);
2217		if (m == NULL) {
2218			return;
2219		}
2220		/* let BPF see it */
2221		BPF_MTAP(ifp, m);
2222
2223		/* give it to the nic */
2224		mxge_encap(ss, m);
2225	}
2226	/* ran out of transmit slots */
2227	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2228	    && (!drbr_empty(ifp, tx->br))) {
2229		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2230		tx->stall++;
2231	}
2232}
2233
2234static int
2235mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2236{
2237	mxge_softc_t *sc;
2238	struct ifnet *ifp;
2239	mxge_tx_ring_t *tx;
2240	int err;
2241
2242	sc = ss->sc;
2243	ifp = sc->ifp;
2244	tx = &ss->tx;
2245
2246	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2247	    IFF_DRV_RUNNING) {
2248		err = drbr_enqueue(ifp, tx->br, m);
2249		return (err);
2250	}
2251
2252	if (drbr_empty(ifp, tx->br) &&
2253	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2254		/* let BPF see it */
2255		BPF_MTAP(ifp, m);
2256		/* give it to the nic */
2257		mxge_encap(ss, m);
2258	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2259		return (err);
2260	}
2261	if (!drbr_empty(ifp, tx->br))
2262		mxge_start_locked(ss);
2263	return (0);
2264}
2265
2266static int
2267mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2268{
2269	mxge_softc_t *sc = ifp->if_softc;
2270	struct mxge_slice_state *ss;
2271	mxge_tx_ring_t *tx;
2272	int err = 0;
2273	int slice;
2274
2275	slice = m->m_pkthdr.flowid;
2276	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2277
2278	ss = &sc->ss[slice];
2279	tx = &ss->tx;
2280
2281	if (mtx_trylock(&tx->mtx)) {
2282		err = mxge_transmit_locked(ss, m);
2283		mtx_unlock(&tx->mtx);
2284	} else {
2285		err = drbr_enqueue(ifp, tx->br, m);
2286	}
2287
2288	return (err);
2289}
2290
2291#else
2292
2293static inline void
2294mxge_start_locked(struct mxge_slice_state *ss)
2295{
2296	mxge_softc_t *sc;
2297	struct mbuf *m;
2298	struct ifnet *ifp;
2299	mxge_tx_ring_t *tx;
2300
2301	sc = ss->sc;
2302	ifp = sc->ifp;
2303	tx = &ss->tx;
2304	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2305		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2306		if (m == NULL) {
2307			return;
2308		}
2309		/* let BPF see it */
2310		BPF_MTAP(ifp, m);
2311
2312		/* give it to the nic */
2313		mxge_encap(ss, m);
2314	}
2315	/* ran out of transmit slots */
2316	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2317		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2318		tx->stall++;
2319	}
2320}
2321#endif
2322static void
2323mxge_start(struct ifnet *ifp)
2324{
2325	mxge_softc_t *sc = ifp->if_softc;
2326	struct mxge_slice_state *ss;
2327
2328	/* only use the first slice for now */
2329	ss = &sc->ss[0];
2330	mtx_lock(&ss->tx.mtx);
2331	mxge_start_locked(ss);
2332	mtx_unlock(&ss->tx.mtx);
2333}
2334
2335/*
2336 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2337 * at most 32 bytes at a time, so as to avoid involving the software
2338 * pio handler in the nic.   We re-write the first segment's low
2339 * DMA address to mark it valid only after we write the entire chunk
2340 * in a burst
2341 */
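/*
 * Sizing note: each receive descriptor is two 32-bit words (8 bytes),
 * so the 8 entries go out as two 32-byte bursts with a barrier in
 * between.  The 0xffffffff written over addr_low serves as the
 * not-yet-valid marker until the final write exposes the first entry.
 */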
2342static inline void
2343mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2344		mcp_kreq_ether_recv_t *src)
2345{
2346	uint32_t low;
2347
2348	low = src->addr_low;
2349	src->addr_low = 0xffffffff;
2350	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2351	wmb();
2352	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2353	wmb();
2354	src->addr_low = low;
2355	dst->addr_low = low;
2356	wmb();
2357}
2358
2359static int
2360mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2361{
2362	bus_dma_segment_t seg;
2363	struct mbuf *m;
2364	mxge_rx_ring_t *rx = &ss->rx_small;
2365	int cnt, err;
2366
2367	m = m_gethdr(M_DONTWAIT, MT_DATA);
2368	if (m == NULL) {
2369		rx->alloc_fail++;
2370		err = ENOBUFS;
2371		goto done;
2372	}
2373	m->m_len = MHLEN;
2374	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2375				      &seg, &cnt, BUS_DMA_NOWAIT);
2376	if (err != 0) {
2377		m_free(m);
2378		goto done;
2379	}
2380	rx->info[idx].m = m;
2381	rx->shadow[idx].addr_low =
2382		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2383	rx->shadow[idx].addr_high =
2384		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2385
2386done:
2387	if ((idx & 7) == 7)
2388		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2389	return err;
2390}
2391
2392static int
2393mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2394{
2395	bus_dma_segment_t seg[3];
2396	struct mbuf *m;
2397	mxge_rx_ring_t *rx = &ss->rx_big;
2398	int cnt, err, i;
2399
2400	if (rx->cl_size == MCLBYTES)
2401		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2402	else
2403		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2404	if (m == NULL) {
2405		rx->alloc_fail++;
2406		err = ENOBUFS;
2407		goto done;
2408	}
2409	m->m_len = rx->mlen;
2410	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2411				      seg, &cnt, BUS_DMA_NOWAIT);
2412	if (err != 0) {
2413		m_free(m);
2414		goto done;
2415	}
2416	rx->info[idx].m = m;
2417	rx->shadow[idx].addr_low =
2418		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2419	rx->shadow[idx].addr_high =
2420		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2421
2422#if MXGE_VIRT_JUMBOS
2423	for (i = 1; i < cnt; i++) {
2424		rx->shadow[idx + i].addr_low =
2425			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2426		rx->shadow[idx + i].addr_high =
2427			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2428	}
2429#endif
2430
2431done:
2432	for (i = 0; i < rx->nbufs; i++) {
2433		if ((idx & 7) == 7) {
2434			mxge_submit_8rx(&rx->lanai[idx - 7],
2435					&rx->shadow[idx - 7]);
2436		}
2437		idx++;
2438	}
2439	return err;
2440}
2441
2442/*
2443 *  Myri10GE hardware checksums are not valid if the sender
2444 *  padded the frame with non-zero padding.  This is because
2445 *  the firmware just does a simple 16-bit 1s complement
2446 *  checksum across the entire frame, excluding the first 14
2447 *  bytes.  It is best to simply check the checksum and
2448 *  tell the stack about it only if the checksum is good
2449 */
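/*
 * Verification amounts to folding the pseudo-header (addresses,
 * protocol, and the TCP/UDP length, i.e. ip_len minus the IP header
 * length) into the firmware's partial sum; a frame checks out when
 * the 1s complement of the result is zero, which is what the caller
 * tests for.
 */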
2450
2451static inline uint16_t
2452mxge_rx_csum(struct mbuf *m, int csum)
2453{
2454	struct ether_header *eh;
2455	struct ip *ip;
2456	uint16_t c;
2457
2458	eh = mtod(m, struct ether_header *);
2459
2460	/* only deal with IPv4 TCP & UDP for now */
2461	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2462		return 1;
2463	ip = (struct ip *)(eh + 1);
2464	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2465			    ip->ip_p != IPPROTO_UDP))
2466		return 1;
2467#ifdef INET
2468	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2469		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2470			    (ip->ip_hl << 2) + ip->ip_p));
2471#else
2472	c = 1;
2473#endif
2474	c ^= 0xffff;
2475	return (c);
2476}
2477
2478static void
2479mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2480{
2481	struct ether_vlan_header *evl;
2482	struct ether_header *eh;
2483	uint32_t partial;
2484
2485	evl = mtod(m, struct ether_vlan_header *);
2486	eh = mtod(m, struct ether_header *);
2487
2488	/*
2489	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2490	 * after what the firmware thought was the end of the ethernet
2491	 * header.
2492	 */
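	/* ones-complement subtraction: adding ~partial together with
	 * the end-around carry below removes the 4 encapsulation
	 * bytes from the sum, and the two folds squeeze any carries
	 * back into the low 16 bits */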
2493
2494	/* put checksum into host byte order */
2495	*csum = ntohs(*csum);
2496	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2497	(*csum) += ~partial;
2498	(*csum) +=  ((*csum) < ~partial);
2499	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2500	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2501
2502	/* restore checksum to network byte order;
2503	   later consumers expect this */
2504	*csum = htons(*csum);
2505
2506	/* save the tag */
2507#ifdef MXGE_NEW_VLAN_API
2508	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2509#else
2510	{
2511		struct m_tag *mtag;
2512		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2513				   M_NOWAIT);
2514		if (mtag == NULL)
2515			return;
2516		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2517		m_tag_prepend(m, mtag);
2518	}
2519
2520#endif
2521	m->m_flags |= M_VLANTAG;
2522
2523	/*
2524	 * Remove the 802.1q header by copying the Ethernet
2525	 * addresses over it and adjusting the beginning of
2526	 * the data in the mbuf.  The encapsulated Ethernet
2527	 * type field is already in place.
2528	 */
2529	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2530	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2531	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2532}
2533
2534
2535static inline void
2536mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2537{
2538	mxge_softc_t *sc;
2539	struct ifnet *ifp;
2540	struct mbuf *m;
2541	struct ether_header *eh;
2542	mxge_rx_ring_t *rx;
2543	bus_dmamap_t old_map;
2544	int idx;
2545	uint16_t tcpudp_csum;
2546
2547	sc = ss->sc;
2548	ifp = sc->ifp;
2549	rx = &ss->rx_big;
2550	idx = rx->cnt & rx->mask;
2551	rx->cnt += rx->nbufs;
2552	/* save a pointer to the received mbuf */
2553	m = rx->info[idx].m;
2554	/* try to replace the received mbuf */
2555	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2556		/* drop the frame -- the old mbuf is re-cycled */
2557		ifp->if_ierrors++;
2558		return;
2559	}
2560
2561	/* unmap the received buffer */
2562	old_map = rx->info[idx].map;
2563	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2564	bus_dmamap_unload(rx->dmat, old_map);
2565
2566	/* swap the bus_dmamap_t's */
2567	rx->info[idx].map = rx->extra_map;
2568	rx->extra_map = old_map;
2569
2570	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2571	 * aligned */
2572	m->m_data += MXGEFW_PAD;
2573
2574	m->m_pkthdr.rcvif = ifp;
2575	m->m_len = m->m_pkthdr.len = len;
2576	ss->ipackets++;
2577	eh = mtod(m, struct ether_header *);
2578	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2579		mxge_vlan_tag_remove(m, &csum);
2580	}
2581	/* if the checksum is valid, mark it in the mbuf header */
2582	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2583		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2584			return;
2585		/* otherwise, it was a UDP frame, or a TCP frame which
2586		   we could not do LRO on.  Tell the stack that the
2587		   checksum is good */
2588		m->m_pkthdr.csum_data = 0xffff;
2589		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2590	}
2591	/* flowid only valid if RSS hashing is enabled */
2592	if (sc->num_slices > 1) {
2593		m->m_pkthdr.flowid = (ss - sc->ss);
2594		m->m_flags |= M_FLOWID;
2595	}
2596	/* pass the frame up the stack */
2597	(*ifp->if_input)(ifp, m);
2598}
2599
2600static inline void
2601mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2602{
2603	mxge_softc_t *sc;
2604	struct ifnet *ifp;
2605	struct ether_header *eh;
2606	struct mbuf *m;
2607	mxge_rx_ring_t *rx;
2608	bus_dmamap_t old_map;
2609	int idx;
2610	uint16_t tcpudp_csum;
2611
2612	sc = ss->sc;
2613	ifp = sc->ifp;
2614	rx = &ss->rx_small;
2615	idx = rx->cnt & rx->mask;
2616	rx->cnt++;
2617	/* save a pointer to the received mbuf */
2618	m = rx->info[idx].m;
2619	/* try to replace the received mbuf */
2620	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621		/* drop the frame -- the old mbuf is re-cycled */
2622		ifp->if_ierrors++;
2623		return;
2624	}
2625
2626	/* unmap the received buffer */
2627	old_map = rx->info[idx].map;
2628	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629	bus_dmamap_unload(rx->dmat, old_map);
2630
2631	/* swap the bus_dmamap_t's */
2632	rx->info[idx].map = rx->extra_map;
2633	rx->extra_map = old_map;
2634
2635	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2636	 * aligned */
2637	m->m_data += MXGEFW_PAD;
2638
2639	m->m_pkthdr.rcvif = ifp;
2640	m->m_len = m->m_pkthdr.len = len;
2641	ss->ipackets++;
2642	eh = mtod(m, struct ether_header *);
2643	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644		mxge_vlan_tag_remove(m, &csum);
2645	}
2646	/* if the checksum is valid, mark it in the mbuf header */
2647	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2649			return;
2650		/* otherwise, it was a UDP frame, or a TCP frame which
2651		   we could not do LRO on.  Tell the stack that the
2652		   checksum is good */
2653		m->m_pkthdr.csum_data = 0xffff;
2654		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2655	}
2656	/* flowid only valid if RSS hashing is enabled */
2657	if (sc->num_slices > 1) {
2658		m->m_pkthdr.flowid = (ss - sc->ss);
2659		m->m_flags |= M_FLOWID;
2660	}
2661	/* pass the frame up the stack */
2662	(*ifp->if_input)(ifp, m);
2663}
2664
2665static inline void
2666mxge_clean_rx_done(struct mxge_slice_state *ss)
2667{
2668	mxge_rx_done_t *rx_done = &ss->rx_done;
2669	int limit = 0;
2670	uint16_t length;
2671	uint16_t checksum;
2672
2673
2674	while (rx_done->entry[rx_done->idx].length != 0) {
2675		length = ntohs(rx_done->entry[rx_done->idx].length);
2676		rx_done->entry[rx_done->idx].length = 0;
2677		checksum = rx_done->entry[rx_done->idx].checksum;
2678		if (length <= (MHLEN - MXGEFW_PAD))
2679			mxge_rx_done_small(ss, length, checksum);
2680		else
2681			mxge_rx_done_big(ss, length, checksum);
2682		rx_done->cnt++;
2683		rx_done->idx = rx_done->cnt & rx_done->mask;
2684
2685		/* limit potential for livelock */
2686		if (__predict_false(++limit > rx_done->mask / 2))
2687			break;
2688	}
2689#ifdef INET
2690	while (!SLIST_EMPTY(&ss->lro_active)) {
2691		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2692		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2693		mxge_lro_flush(ss, lro);
2694	}
2695#endif
2696}
2697
2698
2699static inline void
2700mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2701{
2702	struct ifnet *ifp;
2703	mxge_tx_ring_t *tx;
2704	struct mbuf *m;
2705	bus_dmamap_t map;
2706	int idx;
2707	int *flags;
2708
2709	tx = &ss->tx;
2710	ifp = ss->sc->ifp;
2711	while (tx->pkt_done != mcp_idx) {
2712		idx = tx->done & tx->mask;
2713		tx->done++;
2714		m = tx->info[idx].m;
2715		/* mbuf and DMA map only attached to the first
2716		   segment per-mbuf */
2717		if (m != NULL) {
2718			ss->obytes += m->m_pkthdr.len;
2719			if (m->m_flags & M_MCAST)
2720				ss->omcasts++;
2721			ss->opackets++;
2722			tx->info[idx].m = NULL;
2723			map = tx->info[idx].map;
2724			bus_dmamap_unload(tx->dmat, map);
2725			m_freem(m);
2726		}
2727		if (tx->info[idx].flag) {
2728			tx->info[idx].flag = 0;
2729			tx->pkt_done++;
2730		}
2731	}
2732
2733	/* If we have space, clear IFF_OACTIVE to tell the stack that
2734	   it's OK to send packets */
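	/* note the hysteresis below: the queue is re-enabled only
	 * once the ring has drained below a quarter of its capacity
	 * ((tx->mask + 1) / 4), not on the first free slot */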
2735#ifdef IFNET_BUF_RING
2736	flags = &ss->if_drv_flags;
2737#else
2738	flags = &ifp->if_drv_flags;
2739#endif
2740	mtx_lock(&ss->tx.mtx);
2741	if ((*flags) & IFF_DRV_OACTIVE &&
2742	    tx->req - tx->done < (tx->mask + 1)/4) {
2743		*(flags) &= ~IFF_DRV_OACTIVE;
2744		ss->tx.wake++;
2745		mxge_start_locked(ss);
2746	}
2747#ifdef IFNET_BUF_RING
2748	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2749		/* let the NIC stop polling this queue, since there
2750		 * are no more transmits pending */
2751		if (tx->req == tx->done) {
2752			*tx->send_stop = 1;
2753			tx->queue_active = 0;
2754			tx->deactivate++;
2755			wmb();
2756		}
2757	}
2758#endif
2759	mtx_unlock(&ss->tx.mtx);
2760
2761}
2762
2763static struct mxge_media_type mxge_xfp_media_types[] =
2764{
2765	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2766	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2767	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2768	{0,		(1 << 5),	"10GBASE-ER"},
2769	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2770	{0,		(1 << 3),	"10GBASE-SW"},
2771	{0,		(1 << 2),	"10GBASE-LW"},
2772	{0,		(1 << 1),	"10GBASE-EW"},
2773	{0,		(1 << 0),	"Reserved"}
2774};
2775static struct mxge_media_type mxge_sfp_media_types[] =
2776{
2777	{0,		(1 << 7),	"Reserved"},
2778	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2779	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2780	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2781};
2782
2783static void
2784mxge_set_media(mxge_softc_t *sc, int type)
2785{
2786	sc->media_flags |= type;
2787	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2788	ifmedia_set(&sc->media, sc->media_flags);
2789}
2790
2791
2792/*
2793 * Determine the media type for a NIC.  Some XFPs will identify
2794 * themselves only when their link is up, so this is initiated via a
2795 * link up interrupt.  However, this can potentially take up to
2796 * several milliseconds, so it is run via the watchdog routine, rather
2797 * than in the interrupt handler itself.   This need only be done
2798 * once, not each time the link is up.
2799 */
2800static void
2801mxge_media_probe(mxge_softc_t *sc)
2802{
2803	mxge_cmd_t cmd;
2804	char *cage_type;
2805	char *ptr;
2806	struct mxge_media_type *mxge_media_types = NULL;
2807	int i, err, ms, mxge_media_type_entries;
2808	uint32_t byte;
2809
2810	sc->need_media_probe = 0;
2811
2812	/* if we've already set a media type, we're done */
2813	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2814		return;
2815
2816	/*
2817	 * parse the product code to determine the interface type
2818	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2819	 * after the 3rd dash in the driver's cached copy of the
2820	 * EEPROM's product code string.
2821	 */
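	/* e.g. with a product code shaped like 10G-PCIE-8A-R (the
	 * usual Myricom form, used here only for illustration), the
	 * media character 'R' sits just past the third dash */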
2822	ptr = sc->product_code_string;
2823	if (ptr == NULL) {
2824		device_printf(sc->dev, "Missing product code\n");
		return;
2825	}
2826
2827	for (i = 0; i < 3; i++, ptr++) {
2828		ptr = index(ptr, '-');
2829		if (ptr == NULL) {
2830			device_printf(sc->dev,
2831				      "only %d dashes in PC?!?\n", i);
2832			return;
2833		}
2834	}
2835	if (*ptr == 'C') {
2836		/* -C is CX4 */
2837		mxge_set_media(sc, IFM_10G_CX4);
2838		return;
2839	}
2840	else if (*ptr == 'Q') {
2841		/* -Q is Quad Ribbon Fiber */
2842		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2843		/* FreeBSD has no media type for Quad ribbon fiber */
2844		return;
2845	}
2846
2847	if (*ptr == 'R') {
2848		/* -R is XFP */
2849		mxge_media_types = mxge_xfp_media_types;
2850		mxge_media_type_entries =
2851			sizeof (mxge_xfp_media_types) /
2852			sizeof (mxge_xfp_media_types[0]);
2853		byte = MXGE_XFP_COMPLIANCE_BYTE;
2854		cage_type = "XFP";
2855	}
2856
2857	if (*ptr == 'S' || *(ptr + 1) == 'S') {
2858		/* -S or -2S is SFP+ */
2859		mxge_media_types = mxge_sfp_media_types;
2860		mxge_media_type_entries =
2861			sizeof (mxge_sfp_media_types) /
2862			sizeof (mxge_sfp_media_types[0]);
2863		cage_type = "SFP+";
2864		byte = 3;
2865	}
2866
2867	if (mxge_media_types == NULL) {
2868		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2869		return;
2870	}
2871
2872	/*
2873	 * At this point we know the NIC has an XFP or SFP+ cage, so
2874	 * now we try to determine what is in the cage by using the
2875	 * firmware's I2C commands to read the module's 10GbE compliance
2876	 * register.  We read just one byte, which may take over
2877	 * a millisecond.
2878	 */
2879
2880	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2881	cmd.data1 = byte;
2882	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2883	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2884		device_printf(sc->dev, "failed to read XFP\n");
2885	}
2886	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2887		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2888	}
2889	if (err != MXGEFW_CMD_OK) {
2890		return;
2891	}
2892
2893	/* now we wait for the data to be cached */
2894	cmd.data0 = byte;
2895	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2896	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2897		DELAY(1000);
2898		cmd.data0 = byte;
2899		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2900	}
2901	if (err != MXGEFW_CMD_OK) {
2902		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2903			      cage_type, err, ms);
2904		return;
2905	}
2906
2907	if (cmd.data0 == mxge_media_types[0].bitmask) {
2908		if (mxge_verbose)
2909			device_printf(sc->dev, "%s:%s\n", cage_type,
2910				      mxge_media_types[0].name);
2911		mxge_set_media(sc, IFM_10G_CX4);
2912		return;
2913	}
2914	for (i = 1; i < mxge_media_type_entries; i++) {
2915		if (cmd.data0 & mxge_media_types[i].bitmask) {
2916			if (mxge_verbose)
2917				device_printf(sc->dev, "%s:%s\n",
2918					      cage_type,
2919					      mxge_media_types[i].name);
2920
2921			mxge_set_media(sc, mxge_media_types[i].flag);
2922			return;
2923		}
2924	}
2925	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2926		      cmd.data0);
2927
2928	return;
2929}
2930
2931static void
2932mxge_intr(void *arg)
2933{
2934	struct mxge_slice_state *ss = arg;
2935	mxge_softc_t *sc = ss->sc;
2936	mcp_irq_data_t *stats = ss->fw_stats;
2937	mxge_tx_ring_t *tx = &ss->tx;
2938	mxge_rx_done_t *rx_done = &ss->rx_done;
2939	uint32_t send_done_count;
2940	uint8_t valid;
2941
2942
2943#ifndef IFNET_BUF_RING
2944	/* an interrupt on a non-zero slice is implicitly valid
2945	   since MSI-X irqs are not shared */
2946	if (ss != sc->ss) {
2947		mxge_clean_rx_done(ss);
2948		*ss->irq_claim = be32toh(3);
2949		return;
2950	}
2951#endif
2952
2953	/* make sure the DMA has finished */
2954	if (!stats->valid) {
2955		return;
2956	}
2957	valid = stats->valid;
2958
2959	if (sc->legacy_irq) {
2960		/* lower legacy IRQ  */
2961		*sc->irq_deassert = 0;
2962		if (!mxge_deassert_wait)
2963			/* don't wait for conf. that irq is low */
2964			stats->valid = 0;
2965	} else {
2966		stats->valid = 0;
2967	}
2968
2969	/* loop while waiting for legacy irq deassertion */
2970	do {
2971		/* check for transmit completes and receives */
2972		send_done_count = be32toh(stats->send_done_count);
2973		while ((send_done_count != tx->pkt_done) ||
2974		       (rx_done->entry[rx_done->idx].length != 0)) {
2975			if (send_done_count != tx->pkt_done)
2976				mxge_tx_done(ss, (int)send_done_count);
2977			mxge_clean_rx_done(ss);
2978			send_done_count = be32toh(stats->send_done_count);
2979		}
2980		if (sc->legacy_irq && mxge_deassert_wait)
2981			wmb();
2982	} while (*((volatile uint8_t *) &stats->valid));
2983
2984	/* fw link & error stats meaningful only on the first slice */
2985	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2986		if (sc->link_state != stats->link_up) {
2987			sc->link_state = stats->link_up;
2988			if (sc->link_state) {
2989				if_link_state_change(sc->ifp, LINK_STATE_UP);
2990				if (mxge_verbose)
2991					device_printf(sc->dev, "link up\n");
2992			} else {
2993				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2994				if (mxge_verbose)
2995					device_printf(sc->dev, "link down\n");
2996			}
2997			sc->need_media_probe = 1;
2998		}
2999		if (sc->rdma_tags_available !=
3000		    be32toh(stats->rdma_tags_available)) {
3001			sc->rdma_tags_available =
3002				be32toh(stats->rdma_tags_available);
3003			device_printf(sc->dev, "RDMA timed out! %d tags "
3004				      "left\n", sc->rdma_tags_available);
3005		}
3006
3007		if (stats->link_down) {
3008			sc->down_cnt += stats->link_down;
3009			sc->link_state = 0;
3010			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3011		}
3012	}
3013
3014	/* check to see if we have rx token to pass back */
3015	if (valid & 0x1)
3016	    *ss->irq_claim = be32toh(3);
3017	*(ss->irq_claim + 1) = be32toh(3);
3018}
3019
3020static void
3021mxge_init(void *arg)
3022{
3023}
3024
3025
3026
3027static void
3028mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3029{
3030	struct lro_entry *lro_entry;
3031	int i;
3032
3033	while (!SLIST_EMPTY(&ss->lro_free)) {
3034		lro_entry = SLIST_FIRST(&ss->lro_free);
3035		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3036		free(lro_entry, M_DEVBUF);
3037	}
3038
3039	for (i = 0; i <= ss->rx_big.mask; i++) {
3040		if (ss->rx_big.info[i].m == NULL)
3041			continue;
3042		bus_dmamap_unload(ss->rx_big.dmat,
3043				  ss->rx_big.info[i].map);
3044		m_freem(ss->rx_big.info[i].m);
3045		ss->rx_big.info[i].m = NULL;
3046	}
3047
3048	for (i = 0; i <= ss->rx_small.mask; i++) {
3049		if (ss->rx_small.info[i].m == NULL)
3050			continue;
3051		bus_dmamap_unload(ss->rx_small.dmat,
3052				  ss->rx_small.info[i].map);
3053		m_freem(ss->rx_small.info[i].m);
3054		ss->rx_small.info[i].m = NULL;
3055	}
3056
3057	/* transmit ring used only on the first slice */
3058	if (ss->tx.info == NULL)
3059		return;
3060
3061	for (i = 0; i <= ss->tx.mask; i++) {
3062		ss->tx.info[i].flag = 0;
3063		if (ss->tx.info[i].m == NULL)
3064			continue;
3065		bus_dmamap_unload(ss->tx.dmat,
3066				  ss->tx.info[i].map);
3067		m_freem(ss->tx.info[i].m);
3068		ss->tx.info[i].m = NULL;
3069	}
3070}
3071
3072static void
3073mxge_free_mbufs(mxge_softc_t *sc)
3074{
3075	int slice;
3076
3077	for (slice = 0; slice < sc->num_slices; slice++)
3078		mxge_free_slice_mbufs(&sc->ss[slice]);
3079}
3080
3081static void
3082mxge_free_slice_rings(struct mxge_slice_state *ss)
3083{
3084	int i;
3085
3086
3087	if (ss->rx_done.entry != NULL)
3088		mxge_dma_free(&ss->rx_done.dma);
3089	ss->rx_done.entry = NULL;
3090
3091	if (ss->tx.req_bytes != NULL)
3092		free(ss->tx.req_bytes, M_DEVBUF);
3093	ss->tx.req_bytes = NULL;
3094
3095	if (ss->tx.seg_list != NULL)
3096		free(ss->tx.seg_list, M_DEVBUF);
3097	ss->tx.seg_list = NULL;
3098
3099	if (ss->rx_small.shadow != NULL)
3100		free(ss->rx_small.shadow, M_DEVBUF);
3101	ss->rx_small.shadow = NULL;
3102
3103	if (ss->rx_big.shadow != NULL)
3104		free(ss->rx_big.shadow, M_DEVBUF);
3105	ss->rx_big.shadow = NULL;
3106
3107	if (ss->tx.info != NULL) {
3108		if (ss->tx.dmat != NULL) {
3109			for (i = 0; i <= ss->tx.mask; i++) {
3110				bus_dmamap_destroy(ss->tx.dmat,
3111						   ss->tx.info[i].map);
3112			}
3113			bus_dma_tag_destroy(ss->tx.dmat);
3114		}
3115		free(ss->tx.info, M_DEVBUF);
3116	}
3117	ss->tx.info = NULL;
3118
3119	if (ss->rx_small.info != NULL) {
3120		if (ss->rx_small.dmat != NULL) {
3121			for (i = 0; i <= ss->rx_small.mask; i++) {
3122				bus_dmamap_destroy(ss->rx_small.dmat,
3123						   ss->rx_small.info[i].map);
3124			}
3125			bus_dmamap_destroy(ss->rx_small.dmat,
3126					   ss->rx_small.extra_map);
3127			bus_dma_tag_destroy(ss->rx_small.dmat);
3128		}
3129		free(ss->rx_small.info, M_DEVBUF);
3130	}
3131	ss->rx_small.info = NULL;
3132
3133	if (ss->rx_big.info != NULL) {
3134		if (ss->rx_big.dmat != NULL) {
3135			for (i = 0; i <= ss->rx_big.mask; i++) {
3136				bus_dmamap_destroy(ss->rx_big.dmat,
3137						   ss->rx_big.info[i].map);
3138			}
3139			bus_dmamap_destroy(ss->rx_big.dmat,
3140					   ss->rx_big.extra_map);
3141			bus_dma_tag_destroy(ss->rx_big.dmat);
3142		}
3143		free(ss->rx_big.info, M_DEVBUF);
3144	}
3145	ss->rx_big.info = NULL;
3146}
3147
3148static void
3149mxge_free_rings(mxge_softc_t *sc)
3150{
3151	int slice;
3152
3153	for (slice = 0; slice < sc->num_slices; slice++)
3154		mxge_free_slice_rings(&sc->ss[slice]);
3155}
3156
3157static int
3158mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3159		       int tx_ring_entries)
3160{
3161	mxge_softc_t *sc = ss->sc;
3162	size_t bytes;
3163	int err, i;
3164
3165	err = ENOMEM;
3166
3167	/* allocate per-slice receive resources */
3168
3169	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
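	/* the small and big receive rings share the one rx_done
	 * completion ring, which is presumably why it gets twice
	 * the entries */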
3170	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3171
3172	/* allocate the rx shadow rings */
3173	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3174	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3175	if (ss->rx_small.shadow == NULL)
3176		return err;
3177
3178	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3179	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3180	if (ss->rx_big.shadow == NULL)
3181		return err;
3182
3183	/* allocate the rx host info rings */
3184	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3185	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3186	if (ss->rx_small.info == NULL)
3187		return err;
3188
3189	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3190	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3191	if (ss->rx_big.info == NULL)
3192		return err;
3193
3194	/* allocate the rx busdma resources */
3195	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3196				 1,			/* alignment */
3197				 4096,			/* boundary */
3198				 BUS_SPACE_MAXADDR,	/* low */
3199				 BUS_SPACE_MAXADDR,	/* high */
3200				 NULL, NULL,		/* filter */
3201				 MHLEN,			/* maxsize */
3202				 1,			/* num segs */
3203				 MHLEN,			/* maxsegsize */
3204				 BUS_DMA_ALLOCNOW,	/* flags */
3205				 NULL, NULL,		/* lock */
3206				 &ss->rx_small.dmat);	/* tag */
3207	if (err != 0) {
3208		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3209			      err);
3210		return err;
3211	}
3212
3213	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3214				 1,			/* alignment */
3215#if MXGE_VIRT_JUMBOS
3216				 4096,			/* boundary */
3217#else
3218				 0,			/* boundary */
3219#endif
3220				 BUS_SPACE_MAXADDR,	/* low */
3221				 BUS_SPACE_MAXADDR,	/* high */
3222				 NULL, NULL,		/* filter */
3223				 3*4096,		/* maxsize */
3224#if MXGE_VIRT_JUMBOS
3225				 3,			/* num segs */
3226				 4096,			/* maxsegsize*/
3227#else
3228				 1,			/* num segs */
3229				 MJUM9BYTES,		/* maxsegsize*/
3230#endif
3231				 BUS_DMA_ALLOCNOW,	/* flags */
3232				 NULL, NULL,		/* lock */
3233				 &ss->rx_big.dmat);	/* tag */
3234	if (err != 0) {
3235		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3236			      err);
3237		return err;
3238	}
3239	for (i = 0; i <= ss->rx_small.mask; i++) {
3240		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3241					&ss->rx_small.info[i].map);
3242		if (err != 0) {
3243			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3244				      err);
3245			return err;
3246		}
3247	}
3248	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3249				&ss->rx_small.extra_map);
3250	if (err != 0) {
3251		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3252			      err);
3253		return err;
3254	}
3255
3256	for (i = 0; i <= ss->rx_big.mask; i++) {
3257		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3258					&ss->rx_big.info[i].map);
3259		if (err != 0) {
3260			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3261				      err);
3262			return err;
3263		}
3264	}
3265	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3266				&ss->rx_big.extra_map);
3267	if (err != 0) {
3268		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3269			      err);
3270		return err;
3271	}
3272
3273	/* now allocate TX resources */
3274
3275#ifndef IFNET_BUF_RING
3276	/* only use a single TX ring for now */
3277	if (ss != ss->sc->ss)
3278		return 0;
3279#endif
3280
3281	ss->tx.mask = tx_ring_entries - 1;
3282	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3283
3284
3285	/* allocate the tx request copy block */
3286	bytes = 8 +
3287		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3288	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3289	if (ss->tx.req_bytes == NULL)
3290		return err;
3291	/* ensure req_list entries are aligned to 8 bytes */
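	/* req_bytes was allocated with 8 bytes of slack above, so
	 * rounding up (add 7, mask off the low 3 bits) always stays
	 * inside the allocation */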
3292	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3293		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3294
3295	/* allocate the tx busdma segment list */
3296	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3297	ss->tx.seg_list = (bus_dma_segment_t *)
3298		malloc(bytes, M_DEVBUF, M_WAITOK);
3299	if (ss->tx.seg_list == NULL)
3300		return err;
3301
3302	/* allocate the tx host info ring */
3303	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3304	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3305	if (ss->tx.info == NULL)
3306		return err;
3307
3308	/* allocate the tx busdma resources */
3309	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3310				 1,			/* alignment */
3311				 sc->tx_boundary,	/* boundary */
3312				 BUS_SPACE_MAXADDR,	/* low */
3313				 BUS_SPACE_MAXADDR,	/* high */
3314				 NULL, NULL,		/* filter */
3315				 65536 + 256,		/* maxsize */
3316				 ss->tx.max_desc - 2,	/* num segs */
3317				 sc->tx_boundary,	/* maxsegsz */
3318				 BUS_DMA_ALLOCNOW,	/* flags */
3319				 NULL, NULL,		/* lock */
3320				 &ss->tx.dmat);		/* tag */
3321
3322	if (err != 0) {
3323		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3324			      err);
3325		return err;
3326	}
3327
3328	/* now use these tags to setup dmamaps for each slot
3329	   in the ring */
3330	for (i = 0; i <= ss->tx.mask; i++) {
3331		err = bus_dmamap_create(ss->tx.dmat, 0,
3332					&ss->tx.info[i].map);
3333		if (err != 0) {
3334			device_printf(sc->dev, "Err %d  tx dmamap\n",
3335				      err);
3336			return err;
3337		}
3338	}
3339	return 0;
3340
3341}
3342
3343static int
3344mxge_alloc_rings(mxge_softc_t *sc)
3345{
3346	mxge_cmd_t cmd;
3347	int tx_ring_size;
3348	int tx_ring_entries, rx_ring_entries;
3349	int err, slice;
3350
3351	/* get ring sizes */
3352	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3353	tx_ring_size = cmd.data0;
3354	if (err != 0) {
3355		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3356		goto abort;
3357	}
3358
3359	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3360	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3361	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3362	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3363	IFQ_SET_READY(&sc->ifp->if_snd);
3364
3365	for (slice = 0; slice < sc->num_slices; slice++) {
3366		err = mxge_alloc_slice_rings(&sc->ss[slice],
3367					     rx_ring_entries,
3368					     tx_ring_entries);
3369		if (err != 0)
3370			goto abort;
3371	}
3372	return 0;
3373
3374abort:
3375	mxge_free_rings(sc);
3376	return err;
3377
3378}
3379
3380
3381static void
3382mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3383{
3384	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3385
3386	if (bufsize < MCLBYTES) {
3387		/* easy, everything fits in a single buffer */
3388		*big_buf_size = MCLBYTES;
3389		*cl_size = MCLBYTES;
3390		*nbufs = 1;
3391		return;
3392	}
3393
3394	if (bufsize < MJUMPAGESIZE) {
3395		/* still easy, everything still fits in a single buffer */
3396		*big_buf_size = MJUMPAGESIZE;
3397		*cl_size = MJUMPAGESIZE;
3398		*nbufs = 1;
3399		return;
3400	}
3401#if MXGE_VIRT_JUMBOS
3402	/* now we need to use virtually contiguous buffers */
3403	*cl_size = MJUM9BYTES;
3404	*big_buf_size = 4096;
3405	*nbufs = mtu / 4096 + 1;
3406	/* needs to be a power of two, so round up */
3407	if (*nbufs == 3)
3408		*nbufs = 4;
3409#else
3410	*cl_size = MJUM9BYTES;
3411	*big_buf_size = MJUM9BYTES;
3412	*nbufs = 1;
3413#endif
3414}
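/*
 * For example (on a machine with 4KB pages, where MJUMPAGESIZE is
 * 4096): a standard 1500-byte MTU needs 1500 + 14 + 4 + 2 = 1520
 * bytes and fits a single 2KB cluster, while a 9000-byte jumbo MTU
 * overflows both of the easy cases and (without MXGE_VIRT_JUMBOS)
 * lands in a single 9KB cluster.
 */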
3415
3416static int
3417mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3418{
3419	mxge_softc_t *sc;
3420	mxge_cmd_t cmd;
3421	bus_dmamap_t map;
3422	struct lro_entry *lro_entry;
3423	int err, i, slice;
3424
3425
3426	sc = ss->sc;
3427	slice = ss - sc->ss;
3428
3429	SLIST_INIT(&ss->lro_free);
3430	SLIST_INIT(&ss->lro_active);
3431
3432	for (i = 0; i < sc->lro_cnt; i++) {
3433		lro_entry = (struct lro_entry *)
3434			malloc(sizeof (*lro_entry), M_DEVBUF,
3435			       M_NOWAIT | M_ZERO);
3436		if (lro_entry == NULL) {
3437			sc->lro_cnt = i;
3438			break;
3439		}
3440		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3441	}
3442	/* get the lanai pointers to the send and receive rings */
3443
3444	err = 0;
3445#ifndef IFNET_BUF_RING
3446	/* We currently only send from the first slice */
3447	if (slice == 0) {
3448#endif
3449		cmd.data0 = slice;
3450		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3451		ss->tx.lanai =
3452			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3453		ss->tx.send_go = (volatile uint32_t *)
3454			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3455		ss->tx.send_stop = (volatile uint32_t *)
3456		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3457#ifndef IFNET_BUF_RING
3458	}
3459#endif
3460	cmd.data0 = slice;
3461	err |= mxge_send_cmd(sc,
3462			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3463	ss->rx_small.lanai =
3464		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3465	cmd.data0 = slice;
3466	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3467	ss->rx_big.lanai =
3468		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3469
3470	if (err != 0) {
3471		device_printf(sc->dev,
3472			      "failed to get ring sizes or locations\n");
3473		return EIO;
3474	}
3475
3476	/* stock receive rings */
3477	for (i = 0; i <= ss->rx_small.mask; i++) {
3478		map = ss->rx_small.info[i].map;
3479		err = mxge_get_buf_small(ss, map, i);
3480		if (err) {
3481			device_printf(sc->dev, "alloced %d/%d smalls\n",
3482				      i, ss->rx_small.mask + 1);
3483			return ENOMEM;
3484		}
3485	}
3486	for (i = 0; i <= ss->rx_big.mask; i++) {
3487		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3488		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3489	}
3490	ss->rx_big.nbufs = nbufs;
3491	ss->rx_big.cl_size = cl_size;
3492	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3493		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3494	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3495		map = ss->rx_big.info[i].map;
3496		err = mxge_get_buf_big(ss, map, i);
3497		if (err) {
3498			device_printf(sc->dev, "alloced %d/%d bigs\n",
3499				      i, ss->rx_big.mask + 1);
3500			return ENOMEM;
3501		}
3502	}
3503	return 0;
3504}
3505
3506static int
3507mxge_open(mxge_softc_t *sc)
3508{
3509	mxge_cmd_t cmd;
3510	int err, big_bytes, nbufs, slice, cl_size, i;
3511	bus_addr_t bus;
3512	volatile uint8_t *itable;
3513	struct mxge_slice_state *ss;
3514
3515	/* Copy the MAC address in case it was overridden */
3516	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3517
3518	err = mxge_reset(sc, 1);
3519	if (err != 0) {
3520		device_printf(sc->dev, "failed to reset\n");
3521		return EIO;
3522	}
3523
3524	if (sc->num_slices > 1) {
3525		/* setup the indirection table */
3526		cmd.data0 = sc->num_slices;
3527		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3528				    &cmd);
3529
3530		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3531				     &cmd);
3532		if (err != 0) {
3533			device_printf(sc->dev,
3534				      "failed to setup rss tables\n");
3535			return err;
3536		}
3537
3538		/* just enable an identity mapping */
3539		itable = sc->sram + cmd.data0;
3540		for (i = 0; i < sc->num_slices; i++)
3541			itable[i] = (uint8_t)i;
3542
3543		cmd.data0 = 1;
3544		cmd.data1 = mxge_rss_hash_type;
3545		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3546		if (err != 0) {
3547			device_printf(sc->dev, "failed to enable slices\n");
3548			return err;
3549		}
3550	}
3551
3552
3553	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3554
3555	cmd.data0 = nbufs;
3556	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3557			    &cmd);
3558	/* error is only meaningful if we're trying to set
3559	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3560	if (err && nbufs > 1) {
3561		device_printf(sc->dev,
3562			      "Failed to set always-use-n to %d\n",
3563			      nbufs);
3564		return EIO;
3565	}
3566	/* Give the firmware the mtu and the big and small buffer
3567	   sizes.  The firmware wants the big buf size to be a power
3568	   of two. Luckily, FreeBSD's clusters are powers of two */
3569	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3570	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3571	cmd.data0 = MHLEN - MXGEFW_PAD;
3572	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3573			     &cmd);
3574	cmd.data0 = big_bytes;
3575	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3576
3577	if (err != 0) {
3578		device_printf(sc->dev, "failed to setup params\n");
3579		goto abort;
3580	}
3581
3582	/* Now give the firmware the pointer to the stats block */
3583	for (slice = 0;
3584#ifdef IFNET_BUF_RING
3585	     slice < sc->num_slices;
3586#else
3587	     slice < 1;
3588#endif
3589	     slice++) {
3590		ss = &sc->ss[slice];
3591		cmd.data0 =
3592			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3593		cmd.data1 =
3594			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3595		cmd.data2 = sizeof(struct mcp_irq_data);
3596		cmd.data2 |= (slice << 16);
3597		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3598	}
3599
3600	if (err != 0) {
3601		bus = sc->ss->fw_stats_dma.bus_addr;
3602		bus += offsetof(struct mcp_irq_data, send_done_count);
3603		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3604		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3605		err = mxge_send_cmd(sc,
3606				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3607				    &cmd);
3608		/* Firmware cannot support multicast without STATS_DMA_V2 */
3609		sc->fw_multicast_support = 0;
3610	} else {
3611		sc->fw_multicast_support = 1;
3612	}
3613
3614	if (err != 0) {
3615		device_printf(sc->dev, "failed to setup params\n");
3616		goto abort;
3617	}
3618
3619	for (slice = 0; slice < sc->num_slices; slice++) {
3620		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3621		if (err != 0) {
3622			device_printf(sc->dev, "couldn't open slice %d\n",
3623				      slice);
3624			goto abort;
3625		}
3626	}
3627
3628	/* Finally, start the firmware running */
3629	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3630	if (err) {
3631		device_printf(sc->dev, "Couldn't bring up link\n");
3632		goto abort;
3633	}
3634#ifdef IFNET_BUF_RING
3635	for (slice = 0; slice < sc->num_slices; slice++) {
3636		ss = &sc->ss[slice];
3637		ss->if_drv_flags |= IFF_DRV_RUNNING;
3638		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3639	}
3640#endif
3641	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3642	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3643	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3644
3645	return 0;
3646
3647
3648abort:
3649	mxge_free_mbufs(sc);
3650
3651	return err;
3652}
3653
3654static int
3655mxge_close(mxge_softc_t *sc, int down)
3656{
3657	mxge_cmd_t cmd;
3658	int err, old_down_cnt;
3659#ifdef IFNET_BUF_RING
3660	struct mxge_slice_state *ss;
3661	int slice;
3662#endif
3663
3664	callout_stop(&sc->co_hdl);
3665#ifdef IFNET_BUF_RING
3666	for (slice = 0; slice < sc->num_slices; slice++) {
3667		ss = &sc->ss[slice];
3668		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3669	}
3670#endif
3671	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3672	if (!down) {
3673		old_down_cnt = sc->down_cnt;
3674		wmb();
3675		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3676		if (err) {
3677			device_printf(sc->dev,
3678				      "Couldn't bring down link\n");
3679		}
3680		if (old_down_cnt == sc->down_cnt) {
3681			/* wait for down irq */
3682			DELAY(10 * sc->intr_coal_delay);
3683		}
3684		wmb();
3685		if (old_down_cnt == sc->down_cnt) {
3686			device_printf(sc->dev, "never got down irq\n");
3687		}
3688	}
3689	mxge_free_mbufs(sc);
3690
3691	return 0;
3692}
3693
3694static void
3695mxge_setup_cfg_space(mxge_softc_t *sc)
3696{
3697	device_t dev = sc->dev;
3698	int reg;
3699	uint16_t cmd, lnk, pectl;
3700
3701	/* find the PCIe link width and set max read request to 4KB*/
3702	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3703		lnk = pci_read_config(dev, reg + 0x12, 2);
3704		sc->link_width = (lnk >> 4) & 0x3f;
3705
3706		if (sc->pectl == 0) {
3707			pectl = pci_read_config(dev, reg + 0x8, 2);
3708			pectl = (pectl & ~0x7000) | (5 << 12);
3709			pci_write_config(dev, reg + 0x8, pectl, 2);
3710			sc->pectl = pectl;
3711		} else {
3712			/* restore saved pectl after watchdog reset */
3713			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3714		}
3715	}
3716
3717	/* Enable DMA and Memory space access */
3718	pci_enable_busmaster(dev);
3719	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3720	cmd |= PCIM_CMD_MEMEN;
3721	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3722}
3723
3724static uint32_t
3725mxge_read_reboot(mxge_softc_t *sc)
3726{
3727	device_t dev = sc->dev;
3728	uint32_t vs;
3729
3730	/* find the vendor specific offset */
3731	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3732		device_printf(sc->dev,
3733			      "could not find vendor specific offset\n");
3734		return (uint32_t)-1;
3735	}
3736	/* enable read32 mode */
3737	pci_write_config(dev, vs + 0x10, 0x3, 1);
3738	/* tell NIC which register to read */
3739	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3740	return (pci_read_config(dev, vs + 0x14, 4));
3741}
3742
3743static void
3744mxge_watchdog_reset(mxge_softc_t *sc)
3745{
3746	struct pci_devinfo *dinfo;
3747	struct mxge_slice_state *ss;
3748	int err, running, s, num_tx_slices = 1;
3749	uint32_t reboot;
3750	uint16_t cmd;
3751
3752	err = ENXIO;
3753
3754	device_printf(sc->dev, "Watchdog reset!\n");
3755
3756	/*
3757	 * check to see if the NIC rebooted.  If it did, then all of
3758	 * PCI config space has been reset, and things like the
3759	 * busmaster bit will be zero.  If this is the case, then we
3760	 * must restore PCI config space before the NIC can be used
3761	 * again
3762	 */
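	/* a config-space read returning all 1s (0xffff) means the
	 * device is not answering on the bus at all, since PCI reads
	 * to an absent device return all ones */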
3763	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3764	if (cmd == 0xffff) {
3765		/*
3766		 * maybe the watchdog caught the NIC rebooting; wait
3767		 * up to 100ms for it to finish.  If it does not come
3768		 * back, then give up
3769		 */
3770		DELAY(1000*100);
3771		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3772		if (cmd == 0xffff) {
3773			device_printf(sc->dev, "NIC disappeared!\n");
3774		}
3775	}
3776	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3777		/* print the reboot status */
3778		reboot = mxge_read_reboot(sc);
3779		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3780			      reboot);
3781		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3782		if (running) {
3783
3784			/*
3785			 * quiesce NIC so that TX routines will not try to
3786			 * xmit after restoration of BAR
3787			 */
3788
3789			/* Mark the link as down */
3790			if (sc->link_state) {
3791				sc->link_state = 0;
3792				if_link_state_change(sc->ifp,
3793						     LINK_STATE_DOWN);
3794			}
3795#ifdef IFNET_BUF_RING
3796			num_tx_slices = sc->num_slices;
3797#endif
3798			/* grab all TX locks to ensure no tx  */
3799			for (s = 0; s < num_tx_slices; s++) {
3800				ss = &sc->ss[s];
3801				mtx_lock(&ss->tx.mtx);
3802			}
3803			mxge_close(sc, 1);
3804		}
3805		/* restore PCI configuration space */
3806		dinfo = device_get_ivars(sc->dev);
3807		pci_cfg_restore(sc->dev, dinfo);
3808
3809		/* and redo any changes we made to our config space */
3810		mxge_setup_cfg_space(sc);
3811
3812		/* reload f/w */
3813		err = mxge_load_firmware(sc, 0);
3814		if (err) {
3815			device_printf(sc->dev,
3816				      "Unable to re-load f/w\n");
3817		}
3818		if (running) {
3819			if (!err)
3820				err = mxge_open(sc);
3821			/* release all TX locks */
3822			for (s = 0; s < num_tx_slices; s++) {
3823				ss = &sc->ss[s];
3824#ifdef IFNET_BUF_RING
3825				mxge_start_locked(ss);
3826#endif
3827				mtx_unlock(&ss->tx.mtx);
3828			}
3829		}
3830		sc->watchdog_resets++;
3831	} else {
3832		device_printf(sc->dev,
3833			      "NIC did not reboot, not resetting\n");
3834		err = 0;
3835	}
3836	if (err) {
3837		device_printf(sc->dev, "watchdog reset failed\n");
3838	} else {
3839		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3840			callout_reset(&sc->co_hdl, mxge_ticks,
3841				      mxge_tick, sc);
3842	}
3843}
3844
3845static void
3846mxge_watchdog_task(void *arg, int pending)
3847{
3848	mxge_softc_t *sc = arg;
3849
3850
3851	mtx_lock(&sc->driver_mtx);
3852	mxge_watchdog_reset(sc);
3853	mtx_unlock(&sc->driver_mtx);
3854}
3855
3856static void
3857mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3858{
3859	tx = &sc->ss[slice].tx;
3860	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3861	device_printf(sc->dev,
3862		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3863		      tx->req, tx->done, tx->queue_active);
3864	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3865			      tx->activate, tx->deactivate);
3866	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3867		      tx->pkt_done,
3868		      be32toh(sc->ss->fw_stats->send_done_count));
3869}
3870
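/*
 * Periodic TX hang check.  Each ring keeps watchdog_* snapshots from
 * the previous tick; a ring looks stuck when transmits were and still
 * are outstanding and the done count has not advanced since that
 * snapshot.  If the firmware's pause counter advanced in the same
 * interval, flow control from the link partner is the likely cause
 * and we only warn; otherwise the watchdog task is queued to reset
 * the NIC.
 */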
3871static int
3872mxge_watchdog(mxge_softc_t *sc)
3873{
3874	mxge_tx_ring_t *tx;
3875	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3876	int i, err = 0;
3877
3878	/* see if we have outstanding transmits, which
3879	   have been pending for more than mxge_ticks */
3880	for (i = 0;
3881#ifdef IFNET_BUF_RING
3882	     (i < sc->num_slices) && (err == 0);
3883#else
3884	     (i < 1) && (err == 0);
3885#endif
3886	     i++) {
3887		tx = &sc->ss[i].tx;
3888		if (tx->req != tx->done &&
3889		    tx->watchdog_req != tx->watchdog_done &&
3890		    tx->done == tx->watchdog_done) {
3891			/* check for pause blocking before resetting */
3892			if (tx->watchdog_rx_pause == rx_pause) {
3893				mxge_warn_stuck(sc, tx, i);
3894				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3895				return (ENXIO);
3896			}
3897			else
3898				device_printf(sc->dev, "Flow control blocking "
3899					      "xmits, check link partner\n");
3900		}
3901
3902		tx->watchdog_req = tx->req;
3903		tx->watchdog_done = tx->done;
3904		tx->watchdog_rx_pause = rx_pause;
3905	}
3906
3907	if (sc->need_media_probe)
3908		mxge_media_probe(sc);
3909	return (err);
3910}
3911
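/*
 * Fold the per-slice counters into the single set of ifnet counters;
 * called from the periodic tick, so userland sees totals that are at
 * most one tick stale.
 */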
3912static void
3913mxge_update_stats(mxge_softc_t *sc)
3914{
3915	struct mxge_slice_state *ss;
3916	u_long ipackets = 0;
3917	u_long opackets = 0;
3918#ifdef IFNET_BUF_RING
3919	u_long obytes = 0;
3920	u_long omcasts = 0;
3921	u_long odrops = 0;
3922#endif
3923	u_long oerrors = 0;
3924	int slice;
3925
3926	for (slice = 0; slice < sc->num_slices; slice++) {
3927		ss = &sc->ss[slice];
3928		ipackets += ss->ipackets;
3929		opackets += ss->opackets;
3930#ifdef IFNET_BUF_RING
3931		obytes += ss->obytes;
3932		omcasts += ss->omcasts;
3933		odrops += ss->tx.br->br_drops;
3934#endif
3935		oerrors += ss->oerrors;
3936	}
3937	sc->ifp->if_ipackets = ipackets;
3938	sc->ifp->if_opackets = opackets;
3939#ifdef IFNET_BUF_RING
3940	sc->ifp->if_obytes = obytes;
3941	sc->ifp->if_omcasts = omcasts;
3942	sc->ifp->if_snd.ifq_drops = odrops;
3943#endif
3944	sc->ifp->if_oerrors = oerrors;
3945}
3946
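/*
 * Periodic housekeeping: counters are refreshed on every tick and the
 * TX hang check runs every fourth tick (mxge_ticks defaults to hz/2,
 * so roughly every two seconds).  The callout is not rescheduled once
 * the hang check has queued a reset; mxge_watchdog_reset restarts it
 * after recovery.
 */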
3947static void
3948mxge_tick(void *arg)
3949{
3950	mxge_softc_t *sc = arg;
3951	int err = 0;
3952
3953	/* aggregate stats from different slices */
3954	mxge_update_stats(sc);
3955	if (!sc->watchdog_countdown) {
3956		err = mxge_watchdog(sc);
3957		sc->watchdog_countdown = 4;
3958	}
3959	sc->watchdog_countdown--;
3960	if (err == 0)
3961		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3962
3963}
3964
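/* the media on these adapters is fixed and probed from the
   transceiver; user-requested media changes are rejected */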
3965static int
3966mxge_media_change(struct ifnet *ifp)
3967{
3968	return EINVAL;
3969}
3970
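/*
 * The MTU we are handed is the payload MTU, while the firmware limit
 * (sc->max_mtu) is on the whole frame, so add the Ethernet header and
 * a possible VLAN tag: a 9000 byte MTU needs 9000 + 14 + 4 = 9018
 * bytes of frame.
 */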
3971static int
3972mxge_change_mtu(mxge_softc_t *sc, int mtu)
3973{
3974	struct ifnet *ifp = sc->ifp;
3975	int real_mtu, old_mtu;
3976	int err = 0;
3977
3978
3979	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3980	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3981		return EINVAL;
3982	mtx_lock(&sc->driver_mtx);
3983	old_mtu = ifp->if_mtu;
3984	ifp->if_mtu = mtu;
3985	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3986		mxge_close(sc, 0);
3987		err = mxge_open(sc);
3988		if (err != 0) {
3989			ifp->if_mtu = old_mtu;
3990			mxge_close(sc, 0);
3991			(void) mxge_open(sc);
3992		}
3993	}
3994	mtx_unlock(&sc->driver_mtx);
3995	return err;
3996}
3997
3998static void
3999mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4000{
4001	mxge_softc_t *sc = ifp->if_softc;
4002
4003
4004	if (sc == NULL)
4005		return;
4006	ifmr->ifm_status = IFM_AVALID;
4007	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4008	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
4009	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
4010}
4011
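/*
 * Note the capability couplings in SIOCSIFCAP below: disabling TXCSUM
 * also disables TSO4 (TSO needs the checksum offload), and enabling
 * TSO4 is refused unless TXCSUM is already enabled.
 */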
4012static int
4013mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4014{
4015	mxge_softc_t *sc = ifp->if_softc;
4016	struct ifreq *ifr = (struct ifreq *)data;
4017	int err, mask;
4018
4019	err = 0;
4020	switch (command) {
4021	case SIOCSIFADDR:
4022	case SIOCGIFADDR:
4023		err = ether_ioctl(ifp, command, data);
4024		break;
4025
4026	case SIOCSIFMTU:
4027		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4028		break;
4029
4030	case SIOCSIFFLAGS:
4031		mtx_lock(&sc->driver_mtx);
4032		if (sc->dying) {
4033			mtx_unlock(&sc->driver_mtx);
4034			return EINVAL;
4035		}
4036		if (ifp->if_flags & IFF_UP) {
4037			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4038				err = mxge_open(sc);
4039			} else {
4040				/* take care of promisc and allmulti
4041				   flag changes */
4042				mxge_change_promisc(sc,
4043						    ifp->if_flags & IFF_PROMISC);
4044				mxge_set_multicast_list(sc);
4045			}
4046		} else {
4047			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4048				mxge_close(sc, 0);
4049			}
4050		}
4051		mtx_unlock(&sc->driver_mtx);
4052		break;
4053
4054	case SIOCADDMULTI:
4055	case SIOCDELMULTI:
4056		mtx_lock(&sc->driver_mtx);
4057		mxge_set_multicast_list(sc);
4058		mtx_unlock(&sc->driver_mtx);
4059		break;
4060
4061	case SIOCSIFCAP:
4062		mtx_lock(&sc->driver_mtx);
4063		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4064		if (mask & IFCAP_TXCSUM) {
4065			if (IFCAP_TXCSUM & ifp->if_capenable) {
4066				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4067				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4068						      | CSUM_TSO);
4069			} else {
4070				ifp->if_capenable |= IFCAP_TXCSUM;
4071				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4072			}
4073		} else if (mask & IFCAP_RXCSUM) {
4074			if (IFCAP_RXCSUM & ifp->if_capenable) {
4075				ifp->if_capenable &= ~IFCAP_RXCSUM;
4076				sc->csum_flag = 0;
4077			} else {
4078				ifp->if_capenable |= IFCAP_RXCSUM;
4079				sc->csum_flag = 1;
4080			}
4081		}
4082		if (mask & IFCAP_TSO4) {
4083			if (IFCAP_TSO4 & ifp->if_capenable) {
4084				ifp->if_capenable &= ~IFCAP_TSO4;
4085				ifp->if_hwassist &= ~CSUM_TSO;
4086			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4087				ifp->if_capenable |= IFCAP_TSO4;
4088				ifp->if_hwassist |= CSUM_TSO;
4089			} else {
4090				printf("mxge requires tx checksum offload"
4091				       " be enabled to use TSO\n");
4092				err = EINVAL;
4093			}
4094		}
4095		if (mask & IFCAP_LRO) {
4096			if (IFCAP_LRO & ifp->if_capenable)
4097				err = mxge_change_lro_locked(sc, 0);
4098			else
4099				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4100		}
4101		if (mask & IFCAP_VLAN_HWTAGGING)
4102			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4103		mtx_unlock(&sc->driver_mtx);
4104		VLAN_CAPABILITIES(ifp);
4105
4106		break;
4107
4108	case SIOCGIFMEDIA:
4109		err = ifmedia_ioctl(ifp, ifr,
4110				    &sc->media, command);
4111		break;
4112
4113	default:
4114		err = ENOTTY;
4115	}
4116	return err;
4117}
4118
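/*
 * These knobs come from the kernel environment, so they can be set at
 * boot time, e.g. from /boot/loader.conf (values are illustrative):
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="30"
 */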
4119static void
4120mxge_fetch_tunables(mxge_softc_t *sc)
4121{
4122
4123	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4124	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4125			  &mxge_flow_control);
4126	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4127			  &mxge_intr_coal_delay);
4128	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4129			  &mxge_nvidia_ecrc_enable);
4130	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4131			  &mxge_force_firmware);
4132	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4133			  &mxge_deassert_wait);
4134	TUNABLE_INT_FETCH("hw.mxge.verbose",
4135			  &mxge_verbose);
4136	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4137	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4138	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4139	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4140	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4141	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4142	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4143	if (sc->lro_cnt != 0)
4144		mxge_lro_cnt = sc->lro_cnt;
4145
4146	if (bootverbose)
4147		mxge_verbose = 1;
4148	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4149		mxge_intr_coal_delay = 30;
4150	if (mxge_ticks == 0)
4151		mxge_ticks = hz / 2;
4152	sc->pause = mxge_flow_control;
4153	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4154	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4155		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4156	}
4157	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4158	    mxge_initial_mtu < ETHER_MIN_LEN)
4159		mxge_initial_mtu = ETHERMTU_JUMBO;
4160
4161	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4162		mxge_throttle = MXGE_MAX_THROTTLE;
4163	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4164		mxge_throttle = MXGE_MIN_THROTTLE;
4165	sc->throttle = mxge_throttle;
4166}
4167
4168
4169static void
4170mxge_free_slices(mxge_softc_t *sc)
4171{
4172	struct mxge_slice_state *ss;
4173	int i;
4174
4175
4176	if (sc->ss == NULL)
4177		return;
4178
4179	for (i = 0; i < sc->num_slices; i++) {
4180		ss = &sc->ss[i];
4181		if (ss->fw_stats != NULL) {
4182			mxge_dma_free(&ss->fw_stats_dma);
4183			ss->fw_stats = NULL;
4184#ifdef IFNET_BUF_RING
4185			if (ss->tx.br != NULL) {
4186				drbr_free(ss->tx.br, M_DEVBUF);
4187				ss->tx.br = NULL;
4188			}
4189#endif
4190			mtx_destroy(&ss->tx.mtx);
4191		}
4192		if (ss->rx_done.entry != NULL) {
4193			mxge_dma_free(&ss->rx_done.dma);
4194			ss->rx_done.entry = NULL;
4195		}
4196	}
4197	free(sc->ss, M_DEVBUF);
4198	sc->ss = NULL;
4199}
4200
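/*
 * Sizing note: the per-slice interrupt queue allocated below holds
 * max_intr_slots entries, twice the entry count of one receive ring,
 * which covers completions from both the small and big buffer rings
 * a slice owns.
 */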
4201static int
4202mxge_alloc_slices(mxge_softc_t *sc)
4203{
4204	mxge_cmd_t cmd;
4205	struct mxge_slice_state *ss;
4206	size_t bytes;
4207	int err, i, max_intr_slots;
4208
4209	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4210	if (err != 0) {
4211		device_printf(sc->dev, "Cannot determine rx ring size\n");
4212		return err;
4213	}
4214	sc->rx_ring_size = cmd.data0;
4215	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4216
4217	bytes = sizeof (*sc->ss) * sc->num_slices;
4218	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4219	if (sc->ss == NULL)
4220		return (ENOMEM);
4221	for (i = 0; i < sc->num_slices; i++) {
4222		ss = &sc->ss[i];
4223
4224		ss->sc = sc;
4225
4226		/* allocate per-slice rx interrupt queues */
4227
4228		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4229		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4230		if (err != 0)
4231			goto abort;
4232		ss->rx_done.entry = ss->rx_done.dma.addr;
4233		bzero(ss->rx_done.entry, bytes);
4234
4235		/*
4236		 * allocate the per-slice firmware stats; stats
4237		 * (including tx) are used only on the first
4238		 * slice for now
4239		 */
4240#ifndef IFNET_BUF_RING
4241		if (i > 0)
4242			continue;
4243#endif
4244
4245		bytes = sizeof (*ss->fw_stats);
4246		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4247				     bytes, 64);
4248		if (err != 0)
4249			goto abort;
4250		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4251		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4252			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4253		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4254#ifdef IFNET_BUF_RING
4255		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4256					   &ss->tx.mtx);
4257#endif
4258	}
4259
4260	return (0);
4261
4262abort:
4263	mxge_free_slices(sc);
4264	return (ENOMEM);
4265}
4266
4267static void
4268mxge_slice_probe(mxge_softc_t *sc)
4269{
4270	mxge_cmd_t cmd;
4271	char *old_fw;
4272	int msix_cnt, status, max_intr_slots;
4273
4274	sc->num_slices = 1;
4275	/*
4276	 *  don't enable multiple slices if they are disabled via the
4277	 *  hw.mxge.max_slices tunable, or if this is not an SMP system
4278	 */
4279
4280	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4281		return;
4282
4283	/* see how many MSI-X interrupts are available */
4284	msix_cnt = pci_msix_count(sc->dev);
4285	if (msix_cnt < 2)
4286		return;
4287
4288	/* now load the slice-aware firmware and see what it supports */
4289	old_fw = sc->fw_name;
4290	if (old_fw == mxge_fw_aligned)
4291		sc->fw_name = mxge_fw_rss_aligned;
4292	else
4293		sc->fw_name = mxge_fw_rss_unaligned;
4294	status = mxge_load_firmware(sc, 0);
4295	if (status != 0) {
4296		device_printf(sc->dev, "Falling back to a single slice\n");
4297		return;
4298	}
4299
4300	/* try to send a reset command to the card to see if it
4301	   is alive */
4302	memset(&cmd, 0, sizeof (cmd));
4303	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4304	if (status != 0) {
4305		device_printf(sc->dev, "failed reset\n");
4306		goto abort_with_fw;
4307	}
4308
4309	/* get rx ring size */
4310	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4311	if (status != 0) {
4312		device_printf(sc->dev, "Cannot determine rx ring size\n");
4313		goto abort_with_fw;
4314	}
4315	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4316
4317	/* tell it the size of the interrupt queues */
4318	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4319	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4320	if (status != 0) {
4321		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4322		goto abort_with_fw;
4323	}
4324
4325	/* ask for the maximum number of slices it supports */
4326	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4327	if (status != 0) {
4328		device_printf(sc->dev,
4329			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4330		goto abort_with_fw;
4331	}
4332	sc->num_slices = cmd.data0;
4333	if (sc->num_slices > msix_cnt)
4334		sc->num_slices = msix_cnt;
4335
4336	if (mxge_max_slices == -1) {
4337		/* cap to number of CPUs in system */
4338		if (sc->num_slices > mp_ncpus)
4339			sc->num_slices = mp_ncpus;
4340	} else {
4341		if (sc->num_slices > mxge_max_slices)
4342			sc->num_slices = mxge_max_slices;
4343	}
4344	/* round down to the nearest power of two (e.g. 6 becomes 4) */
4345	while (sc->num_slices & (sc->num_slices - 1))
4346		sc->num_slices--;
4347
4348	if (mxge_verbose)
4349		device_printf(sc->dev, "using %d slices\n",
4350			      sc->num_slices);
4351
4352	return;
4353
4354abort_with_fw:
4355	sc->fw_name = old_fw;
4356	(void) mxge_load_firmware(sc, 0);
4357}
4358
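/*
 * One MSI-X vector is used per slice.  The MSI-X table lives behind
 * BAR(2), so that memory resource must stay allocated as long as the
 * vectors are in use; the abort_with_* labels unwind in reverse order.
 */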
4359static int
4360mxge_add_msix_irqs(mxge_softc_t *sc)
4361{
4362	size_t bytes;
4363	int count, err, i, rid;
4364
4365	rid = PCIR_BAR(2);
4366	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4367						    &rid, RF_ACTIVE);
4368
4369	if (sc->msix_table_res == NULL) {
4370		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4371		return ENXIO;
4372	}
4373
4374	count = sc->num_slices;
4375	err = pci_alloc_msix(sc->dev, &count);
4376	if (err != 0) {
4377		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4378			      "err = %d\n", sc->num_slices, err);
4379		goto abort_with_msix_table;
4380	}
4381	if (count < sc->num_slices) {
4382		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4383			      sc->num_slices, count);
4384		device_printf(sc->dev,
4385			      "Try setting hw.mxge.max_slices to %d\n",
4386			      count);
4387		err = ENOSPC;
4388		goto abort_with_msix;
4389	}
4390	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4391	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4392	if (sc->msix_irq_res == NULL) {
4393		err = ENOMEM;
4394		goto abort_with_msix;
4395	}
4396
4397	for (i = 0; i < sc->num_slices; i++) {
4398		rid = i + 1;
4399		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4400							  SYS_RES_IRQ,
4401							  &rid, RF_ACTIVE);
4402		if (sc->msix_irq_res[i] == NULL) {
4403			device_printf(sc->dev, "couldn't allocate IRQ res"
4404				      " for message %d\n", i);
4405			err = ENXIO;
4406			goto abort_with_res;
4407		}
4408	}
4409
4410	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4411	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4412
4413	for (i = 0; i < sc->num_slices; i++) {
4414		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4415				     INTR_TYPE_NET | INTR_MPSAFE,
4416#if __FreeBSD_version > 700030
4417				     NULL,
4418#endif
4419				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4420		if (err != 0) {
4421			device_printf(sc->dev, "couldn't setup intr for "
4422				      "message %d\n", i);
4423			goto abort_with_intr;
4424		}
4425	}
4426
4427	if (mxge_verbose) {
4428		device_printf(sc->dev, "using %d msix IRQs:",
4429			      sc->num_slices);
4430		for (i = 0; i < sc->num_slices; i++)
4431			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4432		printf("\n");
4433	}
4434	return (0);
4435
4436abort_with_intr:
4437	for (i = 0; i < sc->num_slices; i++) {
4438		if (sc->msix_ih[i] != NULL) {
4439			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4440					  sc->msix_ih[i]);
4441			sc->msix_ih[i] = NULL;
4442		}
4443	}
4444	free(sc->msix_ih, M_DEVBUF);
4445
4446
4447abort_with_res:
4448	for (i = 0; i < sc->num_slices; i++) {
4449		rid = i + 1;
4450		if (sc->msix_irq_res[i] != NULL)
4451			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4452					     sc->msix_irq_res[i]);
4453		sc->msix_irq_res[i] = NULL;
4454	}
4455	free(sc->msix_irq_res, M_DEVBUF);
4456
4457
4458abort_with_msix:
4459	pci_release_msi(sc->dev);
4460
4461abort_with_msix_table:
4462	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4463			     sc->msix_table_res);
4464
4465	return err;
4466}
4467
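/*
 * Single-interrupt fallback: prefer one MSI message (rid 1), else use
 * the shared legacy INTx line (rid 0).
 */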
4468static int
4469mxge_add_single_irq(mxge_softc_t *sc)
4470{
4471	int count, err, rid;
4472
4473	count = pci_msi_count(sc->dev);
4474	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4475		rid = 1;
4476	} else {
4477		rid = 0;
4478		sc->legacy_irq = 1;
4479	}
4480	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4481					 1, RF_SHAREABLE | RF_ACTIVE);
4482	if (sc->irq_res == NULL) {
4483		device_printf(sc->dev, "could not alloc interrupt\n");
4484		return ENXIO;
4485	}
4486	if (mxge_verbose)
4487		device_printf(sc->dev, "using %s irq %ld\n",
4488			      sc->legacy_irq ? "INTx" : "MSI",
4489			      rman_get_start(sc->irq_res));
4490	err = bus_setup_intr(sc->dev, sc->irq_res,
4491			     INTR_TYPE_NET | INTR_MPSAFE,
4492#if __FreeBSD_version > 700030
4493			     NULL,
4494#endif
4495			     mxge_intr, &sc->ss[0], &sc->ih);
4496	if (err != 0) {
4497		bus_release_resource(sc->dev, SYS_RES_IRQ,
4498				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4499		if (!sc->legacy_irq)
4500			pci_release_msi(sc->dev);
4501	}
4502	return err;
4503}
4504
4505static void
4506mxge_rem_msix_irqs(mxge_softc_t *sc)
4507{
4508	int i, rid;
4509
4510	for (i = 0; i < sc->num_slices; i++) {
4511		if (sc->msix_ih[i] != NULL) {
4512			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4513					  sc->msix_ih[i]);
4514			sc->msix_ih[i] = NULL;
4515		}
4516	}
4517	free(sc->msix_ih, M_DEVBUF);
4518
4519	for (i = 0; i < sc->num_slices; i++) {
4520		rid = i + 1;
4521		if (sc->msix_irq_res[i] != NULL)
4522			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4523					     sc->msix_irq_res[i]);
4524		sc->msix_irq_res[i] = NULL;
4525	}
4526	free(sc->msix_irq_res, M_DEVBUF);
4527
4528	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4529			     sc->msix_table_res);
4530
4531	pci_release_msi(sc->dev);
4532	return;
4533}
4534
4535static void
4536mxge_rem_single_irq(mxge_softc_t *sc)
4537{
4538	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4539	bus_release_resource(sc->dev, SYS_RES_IRQ,
4540			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4541	if (!sc->legacy_irq)
4542		pci_release_msi(sc->dev);
4543}
4544
4545static void
4546mxge_rem_irq(mxge_softc_t *sc)
4547{
4548	if (sc->num_slices > 1)
4549		mxge_rem_msix_irqs(sc);
4550	else
4551		mxge_rem_single_irq(sc);
4552}
4553
4554static int
4555mxge_add_irq(mxge_softc_t *sc)
4556{
4557	int err;
4558
4559	if (sc->num_slices > 1)
4560		err = mxge_add_msix_irqs(sc);
4561	else
4562		err = mxge_add_single_irq(sc);
4563
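	/* XXX: the "0 &&" makes this re-probe path unreachable */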
4564	if (0 && err == 0 && sc->num_slices > 1) {
4565		mxge_rem_msix_irqs(sc);
4566		err = mxge_add_msix_irqs(sc);
4567	}
4568	return err;
4569}
4570
4571
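/*
 * Attach order: taskqueue and parent DMA tag, BAR mapping and EEPROM
 * string parsing, firmware selection and load, slice/ring allocation,
 * interrupt setup, and finally ifnet/ifmedia registration.  The
 * abort_with_* labels at the bottom unwind in reverse order.
 */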
4572static int
4573mxge_attach(device_t dev)
4574{
4575	mxge_softc_t *sc = device_get_softc(dev);
4576	struct ifnet *ifp;
4577	int err, rid;
4578
4579	sc->dev = dev;
4580	mxge_fetch_tunables(sc);
4581
4582	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4583	sc->tq = taskqueue_create_fast("mxge_taskq", M_WAITOK,
4584				       taskqueue_thread_enqueue,
4585				       &sc->tq);
4586	if (sc->tq == NULL) {
4587		err = ENOMEM;
4588		goto abort_with_nothing;
4589	}
4590	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4591				device_get_nameunit(sc->dev));
4592
4593	err = bus_dma_tag_create(NULL,			/* parent */
4594				 1,			/* alignment */
4595				 0,			/* boundary */
4596				 BUS_SPACE_MAXADDR,	/* low */
4597				 BUS_SPACE_MAXADDR,	/* high */
4598				 NULL, NULL,		/* filter */
4599				 65536 + 256,		/* maxsize */
4600				 MXGE_MAX_SEND_DESC, 	/* num segs */
4601				 65536,			/* maxsegsize */
4602				 0,			/* flags */
4603				 NULL, NULL,		/* lock */
4604				 &sc->parent_dmat);	/* tag */
4605
4606	if (err != 0) {
4607		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4608			      err);
4609		goto abort_with_tq;
4610	}
4611
4612	ifp = sc->ifp = if_alloc(IFT_ETHER);
4613	if (ifp == NULL) {
4614		device_printf(dev, "can not if_alloc()\n");
4615		err = ENOSPC;
4616		goto abort_with_parent_dmat;
4617	}
4618	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4619
4620	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4621		 device_get_nameunit(dev));
4622	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4623	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4624		 "%s:drv", device_get_nameunit(dev));
4625	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4626		 MTX_NETWORK_LOCK, MTX_DEF);
4627
4628	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4629
4630	mxge_setup_cfg_space(sc);
4631
4632	/* Map the board into the kernel */
4633	rid = PCIR_BARS;
4634	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4635					 ~0, 1, RF_ACTIVE);
4636	if (sc->mem_res == NULL) {
4637		device_printf(dev, "could not map memory\n");
4638		err = ENXIO;
4639		goto abort_with_lock;
4640	}
4641	sc->sram = rman_get_virtual(sc->mem_res);
4642	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4643	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4644		device_printf(dev, "impossible memory region size %ld\n",
4645			      rman_get_size(sc->mem_res));
4646		err = ENXIO;
4647		goto abort_with_mem_res;
4648	}
4649
4650	/* make a NUL-terminated copy of the EEPROM strings section of
4651	   LANai SRAM */
4652	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4653	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4654				rman_get_bushandle(sc->mem_res),
4655				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4656				sc->eeprom_strings,
4657				MXGE_EEPROM_STRINGS_SIZE - 2);
4658	err = mxge_parse_strings(sc);
4659	if (err != 0)
4660		goto abort_with_mem_res;
4661
4662	/* Enable write combining for efficient use of PCIe bus */
4663	mxge_enable_wc(sc);
4664
4665	/* Allocate the out of band dma memory */
4666	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4667			     sizeof (mxge_cmd_t), 64);
4668	if (err != 0)
4669		goto abort_with_mem_res;
4670	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4671	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4672	if (err != 0)
4673		goto abort_with_cmd_dma;
4674
4675	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4676	if (err != 0)
4677		goto abort_with_zeropad_dma;
4678
4679	/* select & load the firmware */
4680	err = mxge_select_firmware(sc);
4681	if (err != 0)
4682		goto abort_with_dmabench;
4683	sc->intr_coal_delay = mxge_intr_coal_delay;
4684
4685	mxge_slice_probe(sc);
4686	err = mxge_alloc_slices(sc);
4687	if (err != 0)
4688		goto abort_with_dmabench;
4689
4690	err = mxge_reset(sc, 0);
4691	if (err != 0)
4692		goto abort_with_slices;
4693
4694	err = mxge_alloc_rings(sc);
4695	if (err != 0) {
4696		device_printf(sc->dev, "failed to allocate rings\n");
4697		goto abort_with_dmabench;
4698	}
4699
4700	err = mxge_add_irq(sc);
4701	if (err != 0) {
4702		device_printf(sc->dev, "failed to add irq\n");
4703		goto abort_with_rings;
4704	}
4705
4706	ifp->if_baudrate = IF_Gbps(10UL);
4707	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4708		IFCAP_VLAN_MTU;
4709#ifdef INET
4710	ifp->if_capabilities |= IFCAP_LRO;
4711#endif
4712
4713#ifdef MXGE_NEW_VLAN_API
4714	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4715#endif
4716
4717	sc->max_mtu = mxge_max_mtu(sc);
4718	if (sc->max_mtu >= 9000)
4719		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4720	else
4721		device_printf(dev, "MTU limited to %d.  Install "
4722			      "latest firmware for 9000 byte jumbo support\n",
4723			      sc->max_mtu - ETHER_HDR_LEN);
4724	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4725	ifp->if_capenable = ifp->if_capabilities;
4726	if (sc->lro_cnt == 0)
4727		ifp->if_capenable &= ~IFCAP_LRO;
4728	sc->csum_flag = 1;
4729	ifp->if_init = mxge_init;
4730	ifp->if_softc = sc;
4731	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4732	ifp->if_ioctl = mxge_ioctl;
4733	ifp->if_start = mxge_start;
4734	/* Initialise the ifmedia structure */
4735	ifmedia_init(&sc->media, 0, mxge_media_change,
4736		     mxge_media_status);
4737	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4738	mxge_media_probe(sc);
4739	sc->dying = 0;
4740	ether_ifattach(ifp, sc->mac_addr);
4741	/* ether_ifattach sets mtu to ETHERMTU */
4742	if (mxge_initial_mtu != ETHERMTU)
4743		mxge_change_mtu(sc, mxge_initial_mtu);
4744
4745	mxge_add_sysctls(sc);
4746#ifdef IFNET_BUF_RING
4747	ifp->if_transmit = mxge_transmit;
4748	ifp->if_qflush = mxge_qflush;
4749#endif
4750	return 0;
4751
4752abort_with_rings:
4753	mxge_free_rings(sc);
4754abort_with_slices:
4755	mxge_free_slices(sc);
4756abort_with_dmabench:
4757	mxge_dma_free(&sc->dmabench_dma);
4758abort_with_zeropad_dma:
4759	mxge_dma_free(&sc->zeropad_dma);
4760abort_with_cmd_dma:
4761	mxge_dma_free(&sc->cmd_dma);
4762abort_with_mem_res:
4763	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4764abort_with_lock:
4765	pci_disable_busmaster(dev);
4766	mtx_destroy(&sc->cmd_mtx);
4767	mtx_destroy(&sc->driver_mtx);
4768	if_free(ifp);
4769abort_with_parent_dmat:
4770	bus_dma_tag_destroy(sc->parent_dmat);
4771abort_with_tq:
4772	if (sc->tq != NULL) {
4773		taskqueue_drain(sc->tq, &sc->watchdog_task);
4774		taskqueue_free(sc->tq);
4775		sc->tq = NULL;
4776	}
4777abort_with_nothing:
4778	return err;
4779}
4780
4781static int
4782mxge_detach(device_t dev)
4783{
4784	mxge_softc_t *sc = device_get_softc(dev);
4785
4786	if (mxge_vlans_active(sc)) {
4787		device_printf(sc->dev,
4788			      "Detach vlans before removing module\n");
4789		return EBUSY;
4790	}
4791	mtx_lock(&sc->driver_mtx);
4792	sc->dying = 1;
4793	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4794		mxge_close(sc, 0);
4795	mtx_unlock(&sc->driver_mtx);
4796	ether_ifdetach(sc->ifp);
4797	if (sc->tq != NULL) {
4798		taskqueue_drain(sc->tq, &sc->watchdog_task);
4799		taskqueue_free(sc->tq);
4800		sc->tq = NULL;
4801	}
4802	callout_drain(&sc->co_hdl);
4803	ifmedia_removeall(&sc->media);
4804	mxge_dummy_rdma(sc, 0);
4805	mxge_rem_sysctls(sc);
4806	mxge_rem_irq(sc);
4807	mxge_free_rings(sc);
4808	mxge_free_slices(sc);
4809	mxge_dma_free(&sc->dmabench_dma);
4810	mxge_dma_free(&sc->zeropad_dma);
4811	mxge_dma_free(&sc->cmd_dma);
4812	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4813	pci_disable_busmaster(dev);
4814	mtx_destroy(&sc->cmd_mtx);
4815	mtx_destroy(&sc->driver_mtx);
4816	if_free(sc->ifp);
4817	bus_dma_tag_destroy(sc->parent_dmat);
4818	return 0;
4819}
4820
4821static int
4822mxge_shutdown(device_t dev)
4823{
4824	return 0;
4825}
4826
4827/*
4828  This file uses Myri10GE driver indentation.
4829
4830  Local Variables:
4831  c-file-style:"linux"
4832  tab-width:8
4833  End:
4834*/
4835