/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 207761 2010-05-07 22:09:17Z fabient $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

/* count xmits ourselves, rather than via drbr */
#define NO_SLOW_STATS
#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;


	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
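
/*
 * Illustrative usage sketch (assumed sizes, not a call made at this
 * exact spot in the file): callers pair mxge_dma_alloc() with
 * mxge_dma_free() roughly as follows, then use dma.addr as the KVA
 * and dma.bus_addr as the address handed to the NIC:
 *
 *	mxge_dma_t dma;
 *
 *	if (mxge_dma_alloc(sc, &dma, 4096, 64) == 0) {
 *		... use dma.addr and dma.bus_addr ...
 *		mxge_dma_free(&dma);
 *	}
 */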


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
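/*
 * For example, a (hypothetical) string region might hold the
 * concatenated, NUL-terminated records:
 *
 *	SN=123456\0MAC=00:60:dd:12:34:56\0PC=SAMPLE-CODE\0\0
 *
 * which mxge_parse_strings() below splits into the serial number,
 * MAC address, and product code.
 */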

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
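	/* Worked example with illustrative numbers: for len = 4096,
	 * a read test returning cmd.data0 = (1000 << 16) | 2048 means
	 * 1000 transfers of 4096 bytes completed in 2048 * 0.5us =
	 * 1024us, so the expression below yields
	 * (1000 * 4096 * 2) / 2048 = 4000, i.e. 4000 bytes/us,
	 * which is the same as 4000 MB/s.
	 */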

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}


	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL)
		goto abort_with_zs;
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}
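
/*
 * Illustrative caller sketch (assumed values, not a call made here):
 * inputs go in cmd.data0..data2 and any result comes back in
 * cmd.data0, e.g.
 *
 *	mxge_cmd_t cmd;
 *
 *	cmd.data0 = sc->rx_ring_size;
 *	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
 *
 * The switch above maps the firmware's MXGEFW_CMD_* status codes to
 * errno values; 0xffffffff means "no reply yet" and is polled for up
 * to 20ms before the call gives up with EAGAIN.
 */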

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}
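
/*
 * For example, a (hypothetical) MAC of 00:60:dd:12:34:56 packs into
 * the two command words above as data0 = 0x0060dd12 and
 * data1 = 0x00003456.
 */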

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
        if (err != 0) {
                return err;
        }

	if (throttle == sc->throttle)
		return 0;

        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc, 0);
		err = mxge_open(sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}
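
/*
 * Note on the handler above: it byte-swaps the firmware counter into
 * arg2 and clears arg1, so sysctl_handle_int() exports arg2 itself as
 * a read-only value.  This is how the big-endian firmware stats block
 * is presented to userland in host byte order by the sysctls
 * registered below.
 */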

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable/disable flow control");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                wmb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints += 3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints += 3;
        *dst_ints = *src_ints;
        tx->req += cnt;
        wmb();
}
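
/*
 * Illustrative timeline for the valid-flag protocol above, e.g. for a
 * 3-descriptor request: all descriptors are PIO-copied with the first
 * descriptor's flags zeroed, a write barrier is issued, and only then
 * is the saved flags byte rewritten via the final 32-bit store, so
 * the NIC can never observe a valid first descriptor before the rest
 * of the chain is visible.
 */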

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  ss->scratch);
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */
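	/* Worked example (illustrative numbers): with a 54-byte
	 * Ethernet+IP+TCP header and mss = 1448, cum_len starts at
	 * -54.  The descriptor where cum_len_next first reaches 0
	 * ends the header: rdma_count is reset and the next
	 * descriptor starts a segment with MXGEFW_FLAGS_FIRST.  Each
	 * time cum_len_next crosses a multiple of mss, the chop logic
	 * sets MXGEFW_FLAGS_TSO_CHOP and the
	 * (req - rdma_count)->rdma_count back-fill below records how
	 * many RDMAs made up the segment that just ended.
	 */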
1891
1892	while (busdma_seg_cnt) {
1893		/* Break the busdma segment up into pieces*/
1894		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1895		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1896		len = seg->ds_len;
1897
1898		while (len) {
1899			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1900			seglen = len;
1901			cum_len_next = cum_len + seglen;
1902			(req-rdma_count)->rdma_count = rdma_count + 1;
1903			if (__predict_true(cum_len >= 0)) {
1904				/* payload */
1905				chop = (cum_len_next > mss);
1906				cum_len_next = cum_len_next % mss;
1907				next_is_first = (cum_len_next == 0);
1908				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1909				flags_next |= next_is_first *
1910					MXGEFW_FLAGS_FIRST;
1911				rdma_count |= -(chop | next_is_first);
1912				rdma_count += chop & !next_is_first;
1913			} else if (cum_len_next >= 0) {
1914				/* header ends */
1915				rdma_count = -1;
1916				cum_len_next = 0;
1917				seglen = -cum_len;
1918				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1919				flags_next = MXGEFW_FLAGS_TSO_PLD |
1920					MXGEFW_FLAGS_FIRST |
1921					(small * MXGEFW_FLAGS_SMALL);
1922			}
1923
1924			req->addr_high = high_swapped;
1925			req->addr_low = htobe32(low);
1926			req->pseudo_hdr_offset = pseudo_hdr_offset;
1927			req->pad = 0;
1928			req->rdma_count = 1;
1929			req->length = htobe16(seglen);
1930			req->cksum_offset = cksum_offset;
1931			req->flags = flags | ((cum_len & 1) *
1932					      MXGEFW_FLAGS_ALIGN_ODD);
1933			low += seglen;
1934			len -= seglen;
1935			cum_len = cum_len_next;
1936			flags = flags_next;
1937			req++;
1938			cnt++;
1939			rdma_count++;
1940			if (__predict_false(cksum_offset > seglen))
1941				cksum_offset -= seglen;
1942			else
1943				cksum_offset = 0;
1944			if (__predict_false(cnt > tx->max_desc))
1945				goto drop;
1946		}
1947		busdma_seg_cnt--;
1948		seg++;
1949	}
1950	(req-rdma_count)->rdma_count = rdma_count;
1951
1952	do {
1953		req--;
1954		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1955	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1956
1957	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1958	mxge_submit_req(tx, tx->req_list, cnt);
1959#ifdef IFNET_BUF_RING
1960	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1961		/* tell the NIC to start polling this slice */
1962		*tx->send_go = 1;
1963		tx->queue_active = 1;
1964		tx->activate++;
1965		wmb();
1966	}
1967#endif
1968	return;
1969
1970drop:
1971	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1972	m_freem(m);
1973	ss->oerrors++;
1974	if (!once) {
1975		printf("tx->max_desc exceeded via TSO!\n");
1976		printf("mss = %d, %ld, %d!\n", mss,
1977		       (long)seg - (long)tx->seg_list, tx->max_desc);
1978		once = 1;
1979	}
1980	return;
1981
1982}
1983
1984#endif /* IFCAP_TSO4 */
1985
1986#ifdef MXGE_NEW_VLAN_API
1987/*
1988 * We reproduce the software vlan tag insertion from
1989 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1990 * vlan tag insertion. We need to advertise this in order to have the
1991 * vlan interface respect our csum offload flags.
1992 */
1993static struct mbuf *
1994mxge_vlan_tag_insert(struct mbuf *m)
1995{
1996	struct ether_vlan_header *evl;
1997
1998	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1999	if (__predict_false(m == NULL))
2000		return NULL;
2001	if (m->m_len < sizeof(*evl)) {
2002		m = m_pullup(m, sizeof(*evl));
2003		if (__predict_false(m == NULL))
2004			return NULL;
2005	}
2006	/*
2007	 * Transform the Ethernet header into an Ethernet header
2008	 * with 802.1Q encapsulation.
2009	 */
2010	evl = mtod(m, struct ether_vlan_header *);
2011	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2012	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2013	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2014	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2015	m->m_flags &= ~M_VLANTAG;
2016	return m;
2017}
2018#endif /* MXGE_NEW_VLAN_API */
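/*
 * Editor's sketch of the transform above, with a hypothetical tag of 5:
 *
 *   before:  | dst | src | type   | payload |
 *   after:   | dst | src | 0x8100 | tag=5 | type | payload |
 *
 * M_PREPEND makes room for the 4-byte 802.1Q shim and the bcopy
 * slides the two MAC addresses back over it, leaving the gap where
 * evl_encap_proto and evl_tag are then written.
 */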
2019
2020static void
2021mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2022{
2023	mxge_softc_t *sc;
2024	mcp_kreq_ether_send_t *req;
2025	bus_dma_segment_t *seg;
2026	struct mbuf *m_tmp;
2027	struct ifnet *ifp;
2028	mxge_tx_ring_t *tx;
2029	struct ip *ip;
2030	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2031	uint16_t pseudo_hdr_offset;
2032	uint8_t flags, cksum_offset;
2033
2034
2035	sc = ss->sc;
2036	ifp = sc->ifp;
2037	tx = &ss->tx;
2038
2039	ip_off = sizeof (struct ether_header);
2040#ifdef MXGE_NEW_VLAN_API
2041	if (m->m_flags & M_VLANTAG) {
2042		m = mxge_vlan_tag_insert(m);
2043		if (__predict_false(m == NULL))
2044			goto drop;
2045		ip_off += ETHER_VLAN_ENCAP_LEN;
2046	}
2047#endif
2048	/* (try to) map the frame for DMA */
2049	idx = tx->req & tx->mask;
2050	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2051				      m, tx->seg_list, &cnt,
2052				      BUS_DMA_NOWAIT);
2053	if (__predict_false(err == EFBIG)) {
2054		/* Too many segments in the chain.  Try
2055		   to defrag */
2056		m_tmp = m_defrag(m, M_NOWAIT);
2057		if (m_tmp == NULL) {
2058			goto drop;
2059		}
2060		ss->tx.defrag++;
2061		m = m_tmp;
2062		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2063					      tx->info[idx].map,
2064					      m, tx->seg_list, &cnt,
2065					      BUS_DMA_NOWAIT);
2066	}
2067	if (__predict_false(err != 0)) {
2068		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2069			      " packet len = %d\n", err, m->m_pkthdr.len);
2070		goto drop;
2071	}
2072	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2073			BUS_DMASYNC_PREWRITE);
2074	tx->info[idx].m = m;
2075
2076#if IFCAP_TSO4
2077	/* TSO is different enough, we handle it in another routine */
2078	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2079		mxge_encap_tso(ss, m, cnt, ip_off);
2080		return;
2081	}
2082#endif
2083
2084	req = tx->req_list;
2085	cksum_offset = 0;
2086	pseudo_hdr_offset = 0;
2087	flags = MXGEFW_FLAGS_NO_TSO;
2088
2089	/* checksum offloading? */
2090	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2091		/* ensure ip header is in first mbuf, copy
2092		   it to a scratch buffer if not */
2093		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2094			m_copydata(m, 0, ip_off + sizeof (*ip),
2095				   ss->scratch);
2096			ip = (struct ip *)(ss->scratch + ip_off);
2097		} else {
2098			ip = (struct ip *)(mtod(m, char *) + ip_off);
2099		}
2100		cksum_offset = ip_off + (ip->ip_hl << 2);
2101		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2102		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2103		req->cksum_offset = cksum_offset;
2104		flags |= MXGEFW_FLAGS_CKSUM;
2105		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2106	} else {
2107		odd_flag = 0;
2108	}
2109	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2110		flags |= MXGEFW_FLAGS_SMALL;
2111
2112	/* convert segments into a request list */
2113	cum_len = 0;
2114	seg = tx->seg_list;
2115	req->flags = MXGEFW_FLAGS_FIRST;
2116	for (i = 0; i < cnt; i++) {
2117		req->addr_low =
2118			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2119		req->addr_high =
2120			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2121		req->length = htobe16(seg->ds_len);
2122		req->cksum_offset = cksum_offset;
2123		if (cksum_offset > seg->ds_len)
2124			cksum_offset -= seg->ds_len;
2125		else
2126			cksum_offset = 0;
2127		req->pseudo_hdr_offset = pseudo_hdr_offset;
2128		req->pad = 0; /* complete solid 16-byte block */
2129		req->rdma_count = 1;
2130		req->flags |= flags | ((cum_len & 1) * odd_flag);
2131		cum_len += seg->ds_len;
2132		seg++;
2133		req++;
2134		req->flags = 0;
2135	}
2136	req--;
2137	/* pad runts to 60 bytes */
2138	if (cum_len < 60) {
2139		req++;
2140		req->addr_low =
2141			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2142		req->addr_high =
2143			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2144		req->length = htobe16(60 - cum_len);
2145		req->cksum_offset = 0;
2146		req->pseudo_hdr_offset = pseudo_hdr_offset;
2147		req->pad = 0; /* complete solid 16-byte block */
2148		req->rdma_count = 1;
2149		req->flags |= flags | ((cum_len & 1) * odd_flag);
2150		cnt++;
2151	}
2152
2153	tx->req_list[0].rdma_count = cnt;
2154#if 0
2155	/* print what the firmware will see */
2156	for (i = 0; i < cnt; i++) {
2157		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2158		    "cso:%d, flags:0x%x, rdma:%d\n",
2159		    i, (int)ntohl(tx->req_list[i].addr_high),
2160		    (int)ntohl(tx->req_list[i].addr_low),
2161		    (int)ntohs(tx->req_list[i].length),
2162		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2163		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2164		    tx->req_list[i].rdma_count);
2165	}
2166	printf("--------------\n");
2167#endif
2168	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2169	mxge_submit_req(tx, tx->req_list, cnt);
2170#ifdef IFNET_BUF_RING
2171	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2172		/* tell the NIC to start polling this slice */
2173		*tx->send_go = 1;
2174		tx->queue_active = 1;
2175		tx->activate++;
2176		wmb();
2177	}
2178#endif
2179	return;
2180
2181drop:
2182	m_freem(m);
2183	ss->oerrors++;
2184	return;
2185}
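/*
 * Editor's example of the offload offsets above (hypothetical
 * IPv4/TCP frame, no VLAN): ip_off = 14, a 20-byte IP header gives
 * cksum_offset = 34, and csum_data = 16 (the offset of th_sum inside
 * the TCP header) yields pseudo_hdr_offset = htobe16(50) -- the byte
 * where the stack already deposited its pseudo-header partial sum.
 */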
2186
2187#ifdef IFNET_BUF_RING
2188static void
2189mxge_qflush(struct ifnet *ifp)
2190{
2191	mxge_softc_t *sc = ifp->if_softc;
2192	mxge_tx_ring_t *tx;
2193	struct mbuf *m;
2194	int slice;
2195
2196	for (slice = 0; slice < sc->num_slices; slice++) {
2197		tx = &sc->ss[slice].tx;
2198		mtx_lock(&tx->mtx);
2199		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2200			m_freem(m);
2201		mtx_unlock(&tx->mtx);
2202	}
2203	if_qflush(ifp);
2204}
2205
2206static inline void
2207mxge_start_locked(struct mxge_slice_state *ss)
2208{
2209	mxge_softc_t *sc;
2210	struct mbuf *m;
2211	struct ifnet *ifp;
2212	mxge_tx_ring_t *tx;
2213
2214	sc = ss->sc;
2215	ifp = sc->ifp;
2216	tx = &ss->tx;
2217
2218	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2219		m = drbr_dequeue(ifp, tx->br);
2220		if (m == NULL) {
2221			return;
2222		}
2223		/* let BPF see it */
2224		BPF_MTAP(ifp, m);
2225
2226		/* give it to the nic */
2227		mxge_encap(ss, m);
2228	}
2229	/* ran out of transmit slots */
2230	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2231	    && (!drbr_empty(ifp, tx->br))) {
2232		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2233		tx->stall++;
2234	}
2235}
2236
2237static int
2238mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2239{
2240	mxge_softc_t *sc;
2241	struct ifnet *ifp;
2242	mxge_tx_ring_t *tx;
2243	int err;
2244
2245	sc = ss->sc;
2246	ifp = sc->ifp;
2247	tx = &ss->tx;
2248
2249	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2250	    IFF_DRV_RUNNING) {
2251		err = drbr_enqueue(ifp, tx->br, m);
2252		return (err);
2253	}
2254
2255	if (!drbr_needs_enqueue(ifp, tx->br) &&
2256	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2257		/* let BPF see it */
2258		BPF_MTAP(ifp, m);
2259		/* give it to the nic */
2260		mxge_encap(ss, m);
2261	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2262		return (err);
2263	}
2264	if (!drbr_empty(ifp, tx->br))
2265		mxge_start_locked(ss);
2266	return (0);
2267}
2268
2269static int
2270mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2271{
2272	mxge_softc_t *sc = ifp->if_softc;
2273	struct mxge_slice_state *ss;
2274	mxge_tx_ring_t *tx;
2275	int err = 0;
2276	int slice;
2277
2278	slice = m->m_pkthdr.flowid;
2279	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2280
2281	ss = &sc->ss[slice];
2282	tx = &ss->tx;
2283
2284	if (mtx_trylock(&tx->mtx)) {
2285		err = mxge_transmit_locked(ss, m);
2286		mtx_unlock(&tx->mtx);
2287	} else {
2288		err = drbr_enqueue(ifp, tx->br, m);
2289	}
2290
2291	return (err);
2292}
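/*
 * Editor's note: because num_slices is always a power of 2, the
 * slice selection above is a mask rather than a modulo.  E.g. with 4
 * slices (hypothetical), flowid 0x1b maps to slice 0x1b & 3 == 3.
 */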
2293
2294#else
2295
2296static inline void
2297mxge_start_locked(struct mxge_slice_state *ss)
2298{
2299	mxge_softc_t *sc;
2300	struct mbuf *m;
2301	struct ifnet *ifp;
2302	mxge_tx_ring_t *tx;
2303
2304	sc = ss->sc;
2305	ifp = sc->ifp;
2306	tx = &ss->tx;
2307	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2308		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2309		if (m == NULL) {
2310			return;
2311		}
2312		/* let BPF see it */
2313		BPF_MTAP(ifp, m);
2314
2315		/* give it to the nic */
2316		mxge_encap(ss, m);
2317	}
2318	/* ran out of transmit slots */
2319	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2320		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2321		tx->stall++;
2322	}
2323}
2324#endif
2325static void
2326mxge_start(struct ifnet *ifp)
2327{
2328	mxge_softc_t *sc = ifp->if_softc;
2329	struct mxge_slice_state *ss;
2330
2331	/* only use the first slice for now */
2332	ss = &sc->ss[0];
2333	mtx_lock(&ss->tx.mtx);
2334	mxge_start_locked(ss);
2335	mtx_unlock(&ss->tx.mtx);
2336}
2337
2338/*
2339 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2340 * at most 32 bytes at a time, so as to avoid involving the software
2341 * pio handler in the nic.  We re-write the first segment's low
2342 * DMA address to mark it valid only after we write the entire chunk
2343 * in a burst.
2344 */
2345static inline void
2346mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2347		mcp_kreq_ether_recv_t *src)
2348{
2349	uint32_t low;
2350
2351	low = src->addr_low;
2352	src->addr_low = 0xffffffff;
2353	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2354	wmb();
2355	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2356	wmb();
2357	src->addr_low = low;
2358	dst->addr_low = low;
2359	wmb();
2360}
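/*
 * Editor's sketch: receive slots are replenished 8 at a time (see
 * the "(idx & 7) == 7" checks in the refill routines below).  The
 * first entry's addr_low is poisoned to 0xffffffff while the two
 * 32-byte bursts are copied, so restoring the real address is the
 * single store that hands the whole group of 8 to the NIC.
 */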
2361
2362static int
2363mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2364{
2365	bus_dma_segment_t seg;
2366	struct mbuf *m;
2367	mxge_rx_ring_t *rx = &ss->rx_small;
2368	int cnt, err;
2369
2370	m = m_gethdr(M_DONTWAIT, MT_DATA);
2371	if (m == NULL) {
2372		rx->alloc_fail++;
2373		err = ENOBUFS;
2374		goto done;
2375	}
2376	m->m_len = MHLEN;
2377	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2378				      &seg, &cnt, BUS_DMA_NOWAIT);
2379	if (err != 0) {
2380		m_free(m);
2381		goto done;
2382	}
2383	rx->info[idx].m = m;
2384	rx->shadow[idx].addr_low =
2385		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2386	rx->shadow[idx].addr_high =
2387		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2388
2389done:
2390	if ((idx & 7) == 7)
2391		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2392	return err;
2393}
2394
2395static int
2396mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2397{
2398	bus_dma_segment_t seg[3];
2399	struct mbuf *m;
2400	mxge_rx_ring_t *rx = &ss->rx_big;
2401	int cnt, err, i;
2402
2403	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2404	if (m == NULL) {
2405		rx->alloc_fail++;
2406		err = ENOBUFS;
2407		goto done;
2408	}
2409	m->m_len = rx->mlen;
2410	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2411				      seg, &cnt, BUS_DMA_NOWAIT);
2412	if (err != 0) {
2413		m_free(m);
2414		goto done;
2415	}
2416	rx->info[idx].m = m;
2417	rx->shadow[idx].addr_low =
2418		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2419	rx->shadow[idx].addr_high =
2420		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2421
2422#if MXGE_VIRT_JUMBOS
2423	for (i = 1; i < cnt; i++) {
2424		rx->shadow[idx + i].addr_low =
2425			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2426		rx->shadow[idx + i].addr_high =
2427			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2428	}
2429#endif
2430
2431done:
2432	for (i = 0; i < rx->nbufs; i++) {
2433		if ((idx & 7) == 7) {
2434			mxge_submit_8rx(&rx->lanai[idx - 7],
2435					&rx->shadow[idx - 7]);
2436		}
2437		idx++;
2438	}
2439	return err;
2440}
2441
2442/*
2443 *  Myri10GE hardware checksums are not valid if the sender
2444 *  padded the frame with non-zero padding.  This is because
2445 *  the firmware just does a simple 16-bit 1s complement
2446 *  checksum across the entire frame, excluding the first 14
2447 *  bytes.  It is best to simply check the checksum and
2448 *  tell the stack about it only if the checksum is good.
2449 */
2450
2451static inline uint16_t
2452mxge_rx_csum(struct mbuf *m, int csum)
2453{
2454	struct ether_header *eh;
2455	struct ip *ip;
2456	uint16_t c;
2457
2458	eh = mtod(m, struct ether_header *);
2459
2460	/* only deal with IPv4 TCP & UDP for now */
2461	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2462		return 1;
2463	ip = (struct ip *)(eh + 1);
2464	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2465			    ip->ip_p != IPPROTO_UDP))
2466		return 1;
2467#ifdef INET
2468	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2469		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2470			    (ip->ip_hl << 2) + ip->ip_p));
2471#else
2472	c = 1;
2473#endif
2474	c ^= 0xffff;
2475	return (c);
2476}
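/*
 * Editor's note on the check above: the firmware's csum is a 16-bit
 * 1's complement sum over everything past the 14-byte Ethernet
 * header, so folding in the pseudo-header via in_pseudo() and
 * inverting must give 0 for a frame whose TCP/UDP checksum verifies;
 * callers treat any nonzero return as "do not set CSUM_DATA_VALID".
 */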
2477
2478static void
2479mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2480{
2481	struct ether_vlan_header *evl;
2482	struct ether_header *eh;
2483	uint32_t partial;
2484
2485	evl = mtod(m, struct ether_vlan_header *);
2486	eh = mtod(m, struct ether_header *);
2487
2488	/*
2489	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2490	 * after what the firmware thought was the end of the ethernet
2491	 * header.
2492	 */
2493
2494	/* put checksum into host byte order */
2495	*csum = ntohs(*csum);
2496	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2497	(*csum) += ~partial;
2498	(*csum) +=  ((*csum) < ~partial);
2499	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2500	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2501
2502	/* restore checksum to network byte order;
2503	   later consumers expect this */
2504	*csum = htons(*csum);
2505
2506	/* save the tag */
2507#ifdef MXGE_NEW_VLAN_API
2508	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2509#else
2510	{
2511		struct m_tag *mtag;
2512		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2513				   M_NOWAIT);
2514		if (mtag == NULL)
2515			return;
2516		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2517		m_tag_prepend(m, mtag);
2518	}
2519
2520#endif
2521	m->m_flags |= M_VLANTAG;
2522
2523	/*
2524	 * Remove the 802.1q header by copying the Ethernet
2525	 * addresses over it and adjusting the beginning of
2526	 * the data in the mbuf.  The encapsulated Ethernet
2527	 * type field is already in place.
2528	 */
2529	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2530	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2531	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2532}
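/*
 * Editor's illustration of the csum fixup above: in 1's complement
 * arithmetic, subtracting the 4 VLAN bytes is "csum += ~partial"
 * plus an end-around carry (the "(*csum) < ~partial" test), and the
 * two shift-and-add lines fold the 32-bit accumulator back into 16
 * bits, leaving the checksum the firmware would have produced had it
 * never summed the 802.1Q shim.
 */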
2533
2534
2535static inline void
2536mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2537{
2538	mxge_softc_t *sc;
2539	struct ifnet *ifp;
2540	struct mbuf *m;
2541	struct ether_header *eh;
2542	mxge_rx_ring_t *rx;
2543	bus_dmamap_t old_map;
2544	int idx;
2545	uint16_t tcpudp_csum;
2546
2547	sc = ss->sc;
2548	ifp = sc->ifp;
2549	rx = &ss->rx_big;
2550	idx = rx->cnt & rx->mask;
2551	rx->cnt += rx->nbufs;
2552	/* save a pointer to the received mbuf */
2553	m = rx->info[idx].m;
2554	/* try to replace the received mbuf */
2555	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2556		/* drop the frame -- the old mbuf is re-cycled */
2557		ifp->if_ierrors++;
2558		return;
2559	}
2560
2561	/* unmap the received buffer */
2562	old_map = rx->info[idx].map;
2563	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2564	bus_dmamap_unload(rx->dmat, old_map);
2565
2566	/* swap the bus_dmamap_t's */
2567	rx->info[idx].map = rx->extra_map;
2568	rx->extra_map = old_map;
2569
2570	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2571	 * aligned */
2572	m->m_data += MXGEFW_PAD;
2573
2574	m->m_pkthdr.rcvif = ifp;
2575	m->m_len = m->m_pkthdr.len = len;
2576	ss->ipackets++;
2577	eh = mtod(m, struct ether_header *);
2578	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2579		mxge_vlan_tag_remove(m, &csum);
2580	}
2581	/* if the checksum is valid, mark it in the mbuf header */
2582	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2583		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2584			return;
2585		/* otherwise, it was a UDP frame, or a TCP frame which
2586		   we could not do LRO on.  Tell the stack that the
2587		   checksum is good */
2588		m->m_pkthdr.csum_data = 0xffff;
2589		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2590	}
2591	/* flowid only valid if RSS hashing is enabled */
2592	if (sc->num_slices > 1) {
2593		m->m_pkthdr.flowid = (ss - sc->ss);
2594		m->m_flags |= M_FLOWID;
2595	}
2596	/* pass the frame up the stack */
2597	(*ifp->if_input)(ifp, m);
2598}
2599
2600static inline void
2601mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2602{
2603	mxge_softc_t *sc;
2604	struct ifnet *ifp;
2605	struct ether_header *eh;
2606	struct mbuf *m;
2607	mxge_rx_ring_t *rx;
2608	bus_dmamap_t old_map;
2609	int idx;
2610	uint16_t tcpudp_csum;
2611
2612	sc = ss->sc;
2613	ifp = sc->ifp;
2614	rx = &ss->rx_small;
2615	idx = rx->cnt & rx->mask;
2616	rx->cnt++;
2617	/* save a pointer to the received mbuf */
2618	m = rx->info[idx].m;
2619	/* try to replace the received mbuf */
2620	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2621		/* drop the frame -- the old mbuf is re-cycled */
2622		ifp->if_ierrors++;
2623		return;
2624	}
2625
2626	/* unmap the received buffer */
2627	old_map = rx->info[idx].map;
2628	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2629	bus_dmamap_unload(rx->dmat, old_map);
2630
2631	/* swap the bus_dmamap_t's */
2632	rx->info[idx].map = rx->extra_map;
2633	rx->extra_map = old_map;
2634
2635	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2636	 * aligned */
2637	m->m_data += MXGEFW_PAD;
2638
2639	m->m_pkthdr.rcvif = ifp;
2640	m->m_len = m->m_pkthdr.len = len;
2641	ss->ipackets++;
2642	eh = mtod(m, struct ether_header *);
2643	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2644		mxge_vlan_tag_remove(m, &csum);
2645	}
2646	/* if the checksum is valid, mark it in the mbuf header */
2647	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2648		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2649			return;
2650		/* otherwise, it was a UDP frame, or a TCP frame which
2651		   we could not do LRO on.  Tell the stack that the
2652		   checksum is good */
2653		m->m_pkthdr.csum_data = 0xffff;
2654		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2655	}
2656	/* flowid only valid if RSS hashing is enabled */
2657	if (sc->num_slices > 1) {
2658		m->m_pkthdr.flowid = (ss - sc->ss);
2659		m->m_flags |= M_FLOWID;
2660	}
2661	/* pass the frame up the stack */
2662	(*ifp->if_input)(ifp, m);
2663}
2664
2665static inline void
2666mxge_clean_rx_done(struct mxge_slice_state *ss)
2667{
2668	mxge_rx_done_t *rx_done = &ss->rx_done;
2669	int limit = 0;
2670	uint16_t length;
2671	uint16_t checksum;
2672
2673
2674	while (rx_done->entry[rx_done->idx].length != 0) {
2675		length = ntohs(rx_done->entry[rx_done->idx].length);
2676		rx_done->entry[rx_done->idx].length = 0;
2677		checksum = rx_done->entry[rx_done->idx].checksum;
2678		if (length <= (MHLEN - MXGEFW_PAD))
2679			mxge_rx_done_small(ss, length, checksum);
2680		else
2681			mxge_rx_done_big(ss, length, checksum);
2682		rx_done->cnt++;
2683		rx_done->idx = rx_done->cnt & rx_done->mask;
2684
2685		/* limit potential for livelock */
2686		if (__predict_false(++limit > rx_done->mask / 2))
2687			break;
2688	}
2689#ifdef INET
2690	while (!SLIST_EMPTY(&ss->lro_active)) {
2691		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2692		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2693		mxge_lro_flush(ss, lro);
2694	}
2695#endif
2696}
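/*
 * Editor's note: the "limit > rx_done->mask / 2" break above bounds
 * the work done per call; with a hypothetical 512-entry completion
 * ring (mask 511), at most 256 frames are drained before returning,
 * which keeps a receive flood from livelocking this loop.
 */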
2697
2698
2699static inline void
2700mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2701{
2702	struct ifnet *ifp;
2703	mxge_tx_ring_t *tx;
2704	struct mbuf *m;
2705	bus_dmamap_t map;
2706	int idx;
2707	int *flags;
2708
2709	tx = &ss->tx;
2710	ifp = ss->sc->ifp;
2711	while (tx->pkt_done != mcp_idx) {
2712		idx = tx->done & tx->mask;
2713		tx->done++;
2714		m = tx->info[idx].m;
2715		/* mbuf and DMA map only attached to the first
2716		   segment per-mbuf */
2717		if (m != NULL) {
2718			ss->obytes += m->m_pkthdr.len;
2719			if (m->m_flags & M_MCAST)
2720				ss->omcasts++;
2721			ss->opackets++;
2722			tx->info[idx].m = NULL;
2723			map = tx->info[idx].map;
2724			bus_dmamap_unload(tx->dmat, map);
2725			m_freem(m);
2726		}
2727		if (tx->info[idx].flag) {
2728			tx->info[idx].flag = 0;
2729			tx->pkt_done++;
2730		}
2731	}
2732
2733	/* If we have space, clear IFF_OACTIVE to tell the stack that
2734	   it's OK to send packets */
2735#ifdef IFNET_BUF_RING
2736	flags = &ss->if_drv_flags;
2737#else
2738	flags = &ifp->if_drv_flags;
2739#endif
2740	mtx_lock(&ss->tx.mtx);
2741	if ((*flags) & IFF_DRV_OACTIVE &&
2742	    tx->req - tx->done < (tx->mask + 1)/4) {
2743		*(flags) &= ~IFF_DRV_OACTIVE;
2744		ss->tx.wake++;
2745		mxge_start_locked(ss);
2746	}
2747#ifdef IFNET_BUF_RING
2748	if (ss->sc->num_slices > 1) {
2749		/* let the NIC stop polling this queue, since there
2750		 * are no more transmits pending */
2751		if (tx->req == tx->done) {
2752			*tx->send_stop = 1;
2753			tx->queue_active = 0;
2754			tx->deactivate++;
2755			wmb();
2756		}
2757	}
2758#endif
2759	mtx_unlock(&ss->tx.mtx);
2760
2761}
2762
2763static struct mxge_media_type mxge_xfp_media_types[] =
2764{
2765	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2766	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2767	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2768	{0,		(1 << 5),	"10GBASE-ER"},
2769	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2770	{0,		(1 << 3),	"10GBASE-SW"},
2771	{0,		(1 << 2),	"10GBASE-LW"},
2772	{0,		(1 << 1),	"10GBASE-EW"},
2773	{0,		(1 << 0),	"Reserved"}
2774};
2775static struct mxge_media_type mxge_sfp_media_types[] =
2776{
2777	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2778	{0,		(1 << 7),	"Reserved"},
2779	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2780	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2781	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
2782};
2783
2784static void
2785mxge_media_set(mxge_softc_t *sc, int media_type)
2786{
2787
2788
2789	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2790		    0, NULL);
2791	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2792	sc->current_media = media_type;
2793	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2794}
2795
2796static void
2797mxge_media_init(mxge_softc_t *sc)
2798{
2799	char *ptr;
2800	int i;
2801
2802	ifmedia_removeall(&sc->media);
2803	mxge_media_set(sc, IFM_AUTO);
2804
2805	/*
2806	 * parse the product code to determine the interface type
2807	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2808	 * after the 3rd dash in the driver's cached copy of the
2809	 * EEPROM's product code string.
2810	 */
2811	ptr = sc->product_code_string;
2812	if (ptr == NULL) {
2813		device_printf(sc->dev, "Missing product code\n");
2814		return;
2815	}
2816
2817	for (i = 0; i < 3; i++, ptr++) {
2818		ptr = index(ptr, '-');
2819		if (ptr == NULL) {
2820			device_printf(sc->dev,
2821				      "only %d dashes in PC?!?\n", i);
2822			return;
2823		}
2824	}
2825	if (*ptr == 'C') {
2826		/* -C is CX4 */
2827		sc->connector = MXGE_CX4;
2828		mxge_media_set(sc, IFM_10G_CX4);
2829	} else if (*ptr == 'Q') {
2830		/* -Q is Quad Ribbon Fiber */
2831		sc->connector = MXGE_QRF;
2832		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2833		/* FreeBSD has no media type for Quad ribbon fiber */
2834	} else if (*ptr == 'R') {
2835		/* -R is XFP */
2836		sc->connector = MXGE_XFP;
2837	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2838		/* -S or -2S is SFP+ */
2839		sc->connector = MXGE_SFP;
2840	} else {
2841		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2842	}
2843}
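/*
 * Editor's example (hypothetical product code): for a string such as
 * "10G-PCIE-8B-2S", the loop above lands on the character after the
 * third '-'; '2' matches nothing directly, but *(ptr + 1) == 'S'
 * classifies the connector as SFP+ (MXGE_SFP).
 */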
2844
2845/*
2846 * Determine the media type for a NIC.  Some XFPs will identify
2847 * themselves only when their link is up, so this is initiated via a
2848 * link up interrupt.  However, this can potentially take up to
2849 * several milliseconds, so it is run via the watchdog routine, rather
2850 * than in the interrupt handler itself.
2851 */
2852static void
2853mxge_media_probe(mxge_softc_t *sc)
2854{
2855	mxge_cmd_t cmd;
2856	char *cage_type;
2857
2858	struct mxge_media_type *mxge_media_types = NULL;
2859	int i, err, ms, mxge_media_type_entries;
2860	uint32_t byte;
2861
2862	sc->need_media_probe = 0;
2863
2864	if (sc->connector == MXGE_XFP) {
2865		/* -R is XFP */
2866		mxge_media_types = mxge_xfp_media_types;
2867		mxge_media_type_entries =
2868			sizeof (mxge_xfp_media_types) /
2869			sizeof (mxge_xfp_media_types[0]);
2870		byte = MXGE_XFP_COMPLIANCE_BYTE;
2871		cage_type = "XFP";
2872	} else if (sc->connector == MXGE_SFP) {
2873		/* -S or -2S is SFP+ */
2874		mxge_media_types = mxge_sfp_media_types;
2875		mxge_media_type_entries =
2876			sizeof (mxge_sfp_media_types) /
2877			sizeof (mxge_sfp_media_types[0]);
2878		cage_type = "SFP+";
2879		byte = 3;
2880	} else {
2881		/* nothing to do; media type cannot change */
2882		return;
2883	}
2884
2885	/*
2886	 * At this point we know the NIC has an XFP or SFP+ cage, so now
2887	 * we try to determine what is in the cage by using the
2888	 * firmware's I2C commands to read the module's 10GbE compliance
2889	 * register.  We read just one byte, which may take over
2890	 * a millisecond.
2891	 */
2892
2893	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2894	cmd.data1 = byte;
2895	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2896	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2897		device_printf(sc->dev, "failed to read XFP\n");
2898	}
2899	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2900		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2901	}
2902	if (err != MXGEFW_CMD_OK) {
2903		return;
2904	}
2905
2906	/* now we wait for the data to be cached */
2907	cmd.data0 = byte;
2908	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2909	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2910		DELAY(1000);
2911		cmd.data0 = byte;
2912		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2913	}
2914	if (err != MXGEFW_CMD_OK) {
2915		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2916			      cage_type, err, ms);
2917		return;
2918	}
2919
2920	if (cmd.data0 == mxge_media_types[0].bitmask) {
2921		if (mxge_verbose)
2922			device_printf(sc->dev, "%s:%s\n", cage_type,
2923				      mxge_media_types[0].name);
2924		if (sc->current_media != mxge_media_types[0].flag) {
2925			mxge_media_init(sc);
2926			mxge_media_set(sc, mxge_media_types[0].flag);
2927		}
2928		return;
2929	}
2930	for (i = 1; i < mxge_media_type_entries; i++) {
2931		if (cmd.data0 & mxge_media_types[i].bitmask) {
2932			if (mxge_verbose)
2933				device_printf(sc->dev, "%s:%s\n",
2934					      cage_type,
2935					      mxge_media_types[i].name);
2936
2937			if (sc->current_media != mxge_media_types[i].flag) {
2938				mxge_media_init(sc);
2939				mxge_media_set(sc, mxge_media_types[i].flag);
2940			}
2941			return;
2942		}
2943	}
2944	if (mxge_verbose)
2945		device_printf(sc->dev, "%s media 0x%x unknown\n",
2946			      cage_type, cmd.data0);
2947
2948	return;
2949}
2950
2951static void
2952mxge_intr(void *arg)
2953{
2954	struct mxge_slice_state *ss = arg;
2955	mxge_softc_t *sc = ss->sc;
2956	mcp_irq_data_t *stats = ss->fw_stats;
2957	mxge_tx_ring_t *tx = &ss->tx;
2958	mxge_rx_done_t *rx_done = &ss->rx_done;
2959	uint32_t send_done_count;
2960	uint8_t valid;
2961
2962
2963#ifndef IFNET_BUF_RING
2964	/* an interrupt on a non-zero slice is implicitly valid
2965	   since MSI-X irqs are not shared */
2966	if (ss != sc->ss) {
2967		mxge_clean_rx_done(ss);
2968		*ss->irq_claim = be32toh(3);
2969		return;
2970	}
2971#endif
2972
2973	/* make sure the DMA has finished */
2974	if (!stats->valid) {
2975		return;
2976	}
2977	valid = stats->valid;
2978
2979	if (sc->legacy_irq) {
2980		/* lower legacy IRQ  */
2981		*sc->irq_deassert = 0;
2982		if (!mxge_deassert_wait)
2983			/* don't wait for conf. that irq is low */
2984			stats->valid = 0;
2985	} else {
2986		stats->valid = 0;
2987	}
2988
2989	/* loop while waiting for legacy irq deassertion */
2990	do {
2991		/* check for transmit completes and receives */
2992		send_done_count = be32toh(stats->send_done_count);
2993		while ((send_done_count != tx->pkt_done) ||
2994		       (rx_done->entry[rx_done->idx].length != 0)) {
2995			if (send_done_count != tx->pkt_done)
2996				mxge_tx_done(ss, (int)send_done_count);
2997			mxge_clean_rx_done(ss);
2998			send_done_count = be32toh(stats->send_done_count);
2999		}
3000		if (sc->legacy_irq && mxge_deassert_wait)
3001			wmb();
3002	} while (*((volatile uint8_t *) &stats->valid));
3003
3004	/* fw link & error stats meaningful only on the first slice */
3005	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3006		if (sc->link_state != stats->link_up) {
3007			sc->link_state = stats->link_up;
3008			if (sc->link_state) {
3009				if_link_state_change(sc->ifp, LINK_STATE_UP);
3010				sc->ifp->if_baudrate = IF_Gbps(10UL);
3011				if (mxge_verbose)
3012					device_printf(sc->dev, "link up\n");
3013			} else {
3014				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3015				sc->ifp->if_baudrate = 0;
3016				if (mxge_verbose)
3017					device_printf(sc->dev, "link down\n");
3018			}
3019			sc->need_media_probe = 1;
3020		}
3021		if (sc->rdma_tags_available !=
3022		    be32toh(stats->rdma_tags_available)) {
3023			sc->rdma_tags_available =
3024				be32toh(stats->rdma_tags_available);
3025			device_printf(sc->dev, "RDMA timed out! %d tags "
3026				      "left\n", sc->rdma_tags_available);
3027		}
3028
3029		if (stats->link_down) {
3030			sc->down_cnt += stats->link_down;
3031			sc->link_state = 0;
3032			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3033		}
3034	}
3035
3036	/* check to see if we have rx token to pass back */
3037	if (valid & 0x1)
3038		*ss->irq_claim = be32toh(3);
3039	*(ss->irq_claim + 1) = be32toh(3);
3040}
3041
3042static void
3043mxge_init(void *arg)
3044{
3045}
3046
3047
3048
3049static void
3050mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3051{
3052	struct lro_entry *lro_entry;
3053	int i;
3054
3055	while (!SLIST_EMPTY(&ss->lro_free)) {
3056		lro_entry = SLIST_FIRST(&ss->lro_free);
3057		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3058		free(lro_entry, M_DEVBUF);
3059	}
3060
3061	for (i = 0; i <= ss->rx_big.mask; i++) {
3062		if (ss->rx_big.info[i].m == NULL)
3063			continue;
3064		bus_dmamap_unload(ss->rx_big.dmat,
3065				  ss->rx_big.info[i].map);
3066		m_freem(ss->rx_big.info[i].m);
3067		ss->rx_big.info[i].m = NULL;
3068	}
3069
3070	for (i = 0; i <= ss->rx_small.mask; i++) {
3071		if (ss->rx_small.info[i].m == NULL)
3072			continue;
3073		bus_dmamap_unload(ss->rx_small.dmat,
3074				  ss->rx_small.info[i].map);
3075		m_freem(ss->rx_small.info[i].m);
3076		ss->rx_small.info[i].m = NULL;
3077	}
3078
3079	/* transmit ring used only on the first slice */
3080	if (ss->tx.info == NULL)
3081		return;
3082
3083	for (i = 0; i <= ss->tx.mask; i++) {
3084		ss->tx.info[i].flag = 0;
3085		if (ss->tx.info[i].m == NULL)
3086			continue;
3087		bus_dmamap_unload(ss->tx.dmat,
3088				  ss->tx.info[i].map);
3089		m_freem(ss->tx.info[i].m);
3090		ss->tx.info[i].m = NULL;
3091	}
3092}
3093
3094static void
3095mxge_free_mbufs(mxge_softc_t *sc)
3096{
3097	int slice;
3098
3099	for (slice = 0; slice < sc->num_slices; slice++)
3100		mxge_free_slice_mbufs(&sc->ss[slice]);
3101}
3102
3103static void
3104mxge_free_slice_rings(struct mxge_slice_state *ss)
3105{
3106	int i;
3107
3108
3109	if (ss->rx_done.entry != NULL)
3110		mxge_dma_free(&ss->rx_done.dma);
3111	ss->rx_done.entry = NULL;
3112
3113	if (ss->tx.req_bytes != NULL)
3114		free(ss->tx.req_bytes, M_DEVBUF);
3115	ss->tx.req_bytes = NULL;
3116
3117	if (ss->tx.seg_list != NULL)
3118		free(ss->tx.seg_list, M_DEVBUF);
3119	ss->tx.seg_list = NULL;
3120
3121	if (ss->rx_small.shadow != NULL)
3122		free(ss->rx_small.shadow, M_DEVBUF);
3123	ss->rx_small.shadow = NULL;
3124
3125	if (ss->rx_big.shadow != NULL)
3126		free(ss->rx_big.shadow, M_DEVBUF);
3127	ss->rx_big.shadow = NULL;
3128
3129	if (ss->tx.info != NULL) {
3130		if (ss->tx.dmat != NULL) {
3131			for (i = 0; i <= ss->tx.mask; i++) {
3132				bus_dmamap_destroy(ss->tx.dmat,
3133						   ss->tx.info[i].map);
3134			}
3135			bus_dma_tag_destroy(ss->tx.dmat);
3136		}
3137		free(ss->tx.info, M_DEVBUF);
3138	}
3139	ss->tx.info = NULL;
3140
3141	if (ss->rx_small.info != NULL) {
3142		if (ss->rx_small.dmat != NULL) {
3143			for (i = 0; i <= ss->rx_small.mask; i++) {
3144				bus_dmamap_destroy(ss->rx_small.dmat,
3145						   ss->rx_small.info[i].map);
3146			}
3147			bus_dmamap_destroy(ss->rx_small.dmat,
3148					   ss->rx_small.extra_map);
3149			bus_dma_tag_destroy(ss->rx_small.dmat);
3150		}
3151		free(ss->rx_small.info, M_DEVBUF);
3152	}
3153	ss->rx_small.info = NULL;
3154
3155	if (ss->rx_big.info != NULL) {
3156		if (ss->rx_big.dmat != NULL) {
3157			for (i = 0; i <= ss->rx_big.mask; i++) {
3158				bus_dmamap_destroy(ss->rx_big.dmat,
3159						   ss->rx_big.info[i].map);
3160			}
3161			bus_dmamap_destroy(ss->rx_big.dmat,
3162					   ss->rx_big.extra_map);
3163			bus_dma_tag_destroy(ss->rx_big.dmat);
3164		}
3165		free(ss->rx_big.info, M_DEVBUF);
3166	}
3167	ss->rx_big.info = NULL;
3168}
3169
3170static void
3171mxge_free_rings(mxge_softc_t *sc)
3172{
3173	int slice;
3174
3175	for (slice = 0; slice < sc->num_slices; slice++)
3176		mxge_free_slice_rings(&sc->ss[slice]);
3177}
3178
3179static int
3180mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3181		       int tx_ring_entries)
3182{
3183	mxge_softc_t *sc = ss->sc;
3184	size_t bytes;
3185	int err, i;
3186
3187	err = ENOMEM;
3188
3189	/* allocate per-slice receive resources */
3190
3191	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3192	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3193
3194	/* allocate the rx shadow rings */
3195	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3196	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3197	if (ss->rx_small.shadow == NULL)
3198		return err;
3199
3200	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3201	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3202	if (ss->rx_big.shadow == NULL)
3203		return err;
3204
3205	/* allocate the rx host info rings */
3206	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3207	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3208	if (ss->rx_small.info == NULL)
3209		return err;
3210
3211	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3212	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3213	if (ss->rx_big.info == NULL)
3214		return err;
3215
3216	/* allocate the rx busdma resources */
3217	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3218				 1,			/* alignment */
3219				 4096,			/* boundary */
3220				 BUS_SPACE_MAXADDR,	/* low */
3221				 BUS_SPACE_MAXADDR,	/* high */
3222				 NULL, NULL,		/* filter */
3223				 MHLEN,			/* maxsize */
3224				 1,			/* num segs */
3225				 MHLEN,			/* maxsegsize */
3226				 BUS_DMA_ALLOCNOW,	/* flags */
3227				 NULL, NULL,		/* lock */
3228				 &ss->rx_small.dmat);	/* tag */
3229	if (err != 0) {
3230		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3231			      err);
3232		return err;
3233	}
3234
3235	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3236				 1,			/* alignment */
3237#if MXGE_VIRT_JUMBOS
3238				 4096,			/* boundary */
3239#else
3240				 0,			/* boundary */
3241#endif
3242				 BUS_SPACE_MAXADDR,	/* low */
3243				 BUS_SPACE_MAXADDR,	/* high */
3244				 NULL, NULL,		/* filter */
3245				 3*4096,		/* maxsize */
3246#if MXGE_VIRT_JUMBOS
3247				 3,			/* num segs */
3248				 4096,			/* maxsegsize*/
3249#else
3250				 1,			/* num segs */
3251				 MJUM9BYTES,		/* maxsegsize*/
3252#endif
3253				 BUS_DMA_ALLOCNOW,	/* flags */
3254				 NULL, NULL,		/* lock */
3255				 &ss->rx_big.dmat);	/* tag */
3256	if (err != 0) {
3257		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3258			      err);
3259		return err;
3260	}
3261	for (i = 0; i <= ss->rx_small.mask; i++) {
3262		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3263					&ss->rx_small.info[i].map);
3264		if (err != 0) {
3265			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3266				      err);
3267			return err;
3268		}
3269	}
3270	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3271				&ss->rx_small.extra_map);
3272	if (err != 0) {
3273		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3274			      err);
3275		return err;
3276	}
3277
3278	for (i = 0; i <= ss->rx_big.mask; i++) {
3279		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3280					&ss->rx_big.info[i].map);
3281		if (err != 0) {
3282			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3283				      err);
3284			return err;
3285		}
3286	}
3287	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3288				&ss->rx_big.extra_map);
3289	if (err != 0) {
3290		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3291			      err);
3292		return err;
3293	}
3294
3295	/* now allocate TX resources */
3296
3297#ifndef IFNET_BUF_RING
3298	/* only use a single TX ring for now */
3299	if (ss != ss->sc->ss)
3300		return 0;
3301#endif
3302
3303	ss->tx.mask = tx_ring_entries - 1;
3304	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3305
3306
3307	/* allocate the tx request copy block */
3308	bytes = 8 +
3309		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3310	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3311	if (ss->tx.req_bytes == NULL)
3312		return err;
3313	/* ensure req_list entries are aligned to 8 bytes */
3314	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3315		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3316
3317	/* allocate the tx busdma segment list */
3318	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3319	ss->tx.seg_list = (bus_dma_segment_t *)
3320		malloc(bytes, M_DEVBUF, M_WAITOK);
3321	if (ss->tx.seg_list == NULL)
3322		return err;
3323
3324	/* allocate the tx host info ring */
3325	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3326	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3327	if (ss->tx.info == NULL)
3328		return err;
3329
3330	/* allocate the tx busdma resources */
3331	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3332				 1,			/* alignment */
3333				 sc->tx_boundary,	/* boundary */
3334				 BUS_SPACE_MAXADDR,	/* low */
3335				 BUS_SPACE_MAXADDR,	/* high */
3336				 NULL, NULL,		/* filter */
3337				 65536 + 256,		/* maxsize */
3338				 ss->tx.max_desc - 2,	/* num segs */
3339				 sc->tx_boundary,	/* maxsegsz */
3340				 BUS_DMA_ALLOCNOW,	/* flags */
3341				 NULL, NULL,		/* lock */
3342				 &ss->tx.dmat);		/* tag */
3343
3344	if (err != 0) {
3345		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3346			      err);
3347		return err;
3348	}
3349
3350	/* now use these tags to setup dmamaps for each slot
3351	   in the ring */
3352	for (i = 0; i <= ss->tx.mask; i++) {
3353		err = bus_dmamap_create(ss->tx.dmat, 0,
3354					&ss->tx.info[i].map);
3355		if (err != 0) {
3356			device_printf(sc->dev, "Err %d  tx dmamap\n",
3357				      err);
3358			return err;
3359		}
3360	}
3361	return 0;
3362
3363}
3364
3365static int
3366mxge_alloc_rings(mxge_softc_t *sc)
3367{
3368	mxge_cmd_t cmd;
3369	int tx_ring_size;
3370	int tx_ring_entries, rx_ring_entries;
3371	int err, slice;
3372
3373	/* get ring sizes */
3374	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3375	tx_ring_size = cmd.data0;
3376	if (err != 0) {
3377		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3378		goto abort;
3379	}
3380
3381	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3382	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3383	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3384	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3385	IFQ_SET_READY(&sc->ifp->if_snd);
3386
3387	for (slice = 0; slice < sc->num_slices; slice++) {
3388		err = mxge_alloc_slice_rings(&sc->ss[slice],
3389					     rx_ring_entries,
3390					     tx_ring_entries);
3391		if (err != 0)
3392			goto abort;
3393	}
3394	return 0;
3395
3396abort:
3397	mxge_free_rings(sc);
3398	return err;
3399
3400}
3401
3402
3403static void
3404mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3405{
3406	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3407
3408	if (bufsize < MCLBYTES) {
3409		/* easy, everything fits in a single buffer */
3410		*big_buf_size = MCLBYTES;
3411		*cl_size = MCLBYTES;
3412		*nbufs = 1;
3413		return;
3414	}
3415
3416	if (bufsize < MJUMPAGESIZE) {
3417		/* still easy, everything still fits in a single buffer */
3418		*big_buf_size = MJUMPAGESIZE;
3419		*cl_size = MJUMPAGESIZE;
3420		*nbufs = 1;
3421		return;
3422	}
3423#if MXGE_VIRT_JUMBOS
3424	/* now we need to use virtually contiguous buffers */
3425	*cl_size = MJUM9BYTES;
3426	*big_buf_size = 4096;
3427	*nbufs = mtu / 4096 + 1;
3428	/* needs to be a power of two, so round up */
3429	if (*nbufs == 3)
3430		*nbufs = 4;
3431#else
3432	*cl_size = MJUM9BYTES;
3433	*big_buf_size = MJUM9BYTES;
3434	*nbufs = 1;
3435#endif
3436}
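/*
 * Editor's worked example for the sizing above (standard FreeBSD
 * cluster sizes assumed): an MTU of 1500 needs 1500 + 14 + 4 + 2 =
 * 1520 bytes and fits one MCLBYTES (2KB) cluster; an MTU of 9000
 * needs 9020 bytes and, without MXGE_VIRT_JUMBOS, is carried in a
 * single MJUM9BYTES (9KB) cluster.
 */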
3437
3438static int
3439mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3440{
3441	mxge_softc_t *sc;
3442	mxge_cmd_t cmd;
3443	bus_dmamap_t map;
3444	struct lro_entry *lro_entry;
3445	int err, i, slice;
3446
3447
3448	sc = ss->sc;
3449	slice = ss - sc->ss;
3450
3451	SLIST_INIT(&ss->lro_free);
3452	SLIST_INIT(&ss->lro_active);
3453
3454	for (i = 0; i < sc->lro_cnt; i++) {
3455		lro_entry = (struct lro_entry *)
3456			malloc(sizeof (*lro_entry), M_DEVBUF,
3457			       M_NOWAIT | M_ZERO);
3458		if (lro_entry == NULL) {
3459			sc->lro_cnt = i;
3460			break;
3461		}
3462		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3463	}
3464	/* get the lanai pointers to the send and receive rings */
3465
3466	err = 0;
3467#ifndef IFNET_BUF_RING
3468	/* We currently only send from the first slice */
3469	if (slice == 0) {
3470#endif
3471		cmd.data0 = slice;
3472		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3473		ss->tx.lanai =
3474			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3475		ss->tx.send_go = (volatile uint32_t *)
3476			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3477		ss->tx.send_stop = (volatile uint32_t *)
3478		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3479#ifndef IFNET_BUF_RING
3480	}
3481#endif
3482	cmd.data0 = slice;
3483	err |= mxge_send_cmd(sc,
3484			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3485	ss->rx_small.lanai =
3486		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3487	cmd.data0 = slice;
3488	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3489	ss->rx_big.lanai =
3490		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3491
3492	if (err != 0) {
3493		device_printf(sc->dev,
3494			      "failed to get ring sizes or locations\n");
3495		return EIO;
3496	}
3497
3498	/* stock receive rings */
3499	for (i = 0; i <= ss->rx_small.mask; i++) {
3500		map = ss->rx_small.info[i].map;
3501		err = mxge_get_buf_small(ss, map, i);
3502		if (err) {
3503			device_printf(sc->dev, "alloced %d/%d smalls\n",
3504				      i, ss->rx_small.mask + 1);
3505			return ENOMEM;
3506		}
3507	}
3508	for (i = 0; i <= ss->rx_big.mask; i++) {
3509		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3510		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3511	}
3512	ss->rx_big.nbufs = nbufs;
3513	ss->rx_big.cl_size = cl_size;
3514	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3515		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3516	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3517		map = ss->rx_big.info[i].map;
3518		err = mxge_get_buf_big(ss, map, i);
3519		if (err) {
3520			device_printf(sc->dev, "alloced %d/%d bigs\n",
3521				      i, ss->rx_big.mask + 1);
3522			return ENOMEM;
3523		}
3524	}
3525	return 0;
3526}
3527
3528static int
3529mxge_open(mxge_softc_t *sc)
3530{
3531	mxge_cmd_t cmd;
3532	int err, big_bytes, nbufs, slice, cl_size, i;
3533	bus_addr_t bus;
3534	volatile uint8_t *itable;
3535	struct mxge_slice_state *ss;
3536
3537	/* Copy the MAC address in case it was overridden */
3538	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3539
3540	err = mxge_reset(sc, 1);
3541	if (err != 0) {
3542		device_printf(sc->dev, "failed to reset\n");
3543		return EIO;
3544	}
3545
3546	if (sc->num_slices > 1) {
3547		/* setup the indirection table */
3548		cmd.data0 = sc->num_slices;
3549		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3550				    &cmd);
3551
3552		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3553				     &cmd);
3554		if (err != 0) {
3555			device_printf(sc->dev,
3556				      "failed to setup rss tables\n");
3557			return err;
3558		}
3559
3560		/* just enable an identity mapping */
3561		itable = sc->sram + cmd.data0;
3562		for (i = 0; i < sc->num_slices; i++)
3563			itable[i] = (uint8_t)i;
3564
3565		cmd.data0 = 1;
3566		cmd.data1 = mxge_rss_hash_type;
3567		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3568		if (err != 0) {
3569			device_printf(sc->dev, "failed to enable slices\n");
3570			return err;
3571		}
3572	}
3573
3574
3575	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3576
3577	cmd.data0 = nbufs;
3578	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3579			    &cmd);
3580	/* error is only meaningful if we're trying to set
3581	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3582	if (err && nbufs > 1) {
3583		device_printf(sc->dev,
3584			      "Failed to set always-use-n to %d\n",
3585			      nbufs);
3586		return EIO;
3587	}
3588	/* Give the firmware the mtu and the big and small buffer
3589	   sizes.  The firmware wants the big buf size to be a power
3590	   of two. Luckily, FreeBSD's clusters are powers of two */
3591	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3592	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3593	cmd.data0 = MHLEN - MXGEFW_PAD;
3594	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3595			     &cmd);
3596	cmd.data0 = big_bytes;
3597	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3598
3599	if (err != 0) {
3600		device_printf(sc->dev, "failed to setup params\n");
3601		goto abort;
3602	}
3603
3604	/* Now give the firmware the pointer to the stats block */
3605	for (slice = 0;
3606#ifdef IFNET_BUF_RING
3607	     slice < sc->num_slices;
3608#else
3609	     slice < 1;
3610#endif
3611	     slice++) {
3612		ss = &sc->ss[slice];
3613		cmd.data0 =
3614			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3615		cmd.data1 =
3616			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3617		cmd.data2 = sizeof(struct mcp_irq_data);
3618		cmd.data2 |= (slice << 16);
3619		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3620	}
3621
3622	if (err != 0) {
3623		bus = sc->ss->fw_stats_dma.bus_addr;
3624		bus += offsetof(struct mcp_irq_data, send_done_count);
3625		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3626		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3627		err = mxge_send_cmd(sc,
3628				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3629				    &cmd);
3630		/* Firmware cannot support multicast without STATS_DMA_V2 */
3631		sc->fw_multicast_support = 0;
3632	} else {
3633		sc->fw_multicast_support = 1;
3634	}
3635
3636	if (err != 0) {
3637		device_printf(sc->dev, "failed to setup params\n");
3638		goto abort;
3639	}
3640
3641	for (slice = 0; slice < sc->num_slices; slice++) {
3642		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3643		if (err != 0) {
3644			device_printf(sc->dev, "couldn't open slice %d\n",
3645				      slice);
3646			goto abort;
3647		}
3648	}
3649
3650	/* Finally, start the firmware running */
3651	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3652	if (err) {
3653		device_printf(sc->dev, "Couldn't bring up link\n");
3654		goto abort;
3655	}
3656#ifdef IFNET_BUF_RING
3657	for (slice = 0; slice < sc->num_slices; slice++) {
3658		ss = &sc->ss[slice];
3659		ss->if_drv_flags |= IFF_DRV_RUNNING;
3660		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3661	}
3662#endif
3663	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3664	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3665
3666	return 0;
3667
3668
3669abort:
3670	mxge_free_mbufs(sc);
3671
3672	return err;
3673}
3674
3675static int
3676mxge_close(mxge_softc_t *sc, int down)
3677{
3678	mxge_cmd_t cmd;
3679	int err, old_down_cnt;
3680#ifdef IFNET_BUF_RING
3681	struct mxge_slice_state *ss;
3682	int slice;
3683#endif
3684
3685#ifdef IFNET_BUF_RING
3686	for (slice = 0; slice < sc->num_slices; slice++) {
3687		ss = &sc->ss[slice];
3688		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3689	}
3690#endif
3691	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3692	if (!down) {
3693		old_down_cnt = sc->down_cnt;
3694		wmb();
3695		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3696		if (err) {
3697			device_printf(sc->dev,
3698				      "Couldn't bring down link\n");
3699		}
3700		if (old_down_cnt == sc->down_cnt) {
3701			/* wait for down irq */
3702			DELAY(10 * sc->intr_coal_delay);
3703		}
3704		wmb();
3705		if (old_down_cnt == sc->down_cnt) {
3706			device_printf(sc->dev, "never got down irq\n");
3707		}
3708	}
3709	mxge_free_mbufs(sc);
3710
3711	return 0;
3712}
3713
3714static void
3715mxge_setup_cfg_space(mxge_softc_t *sc)
3716{
3717	device_t dev = sc->dev;
3718	int reg;
3719	uint16_t cmd, lnk, pectl;
3720
3721	/* find the PCIe link width and set max read request to 4KB*/
3722	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3723		lnk = pci_read_config(dev, reg + 0x12, 2);
3724		sc->link_width = (lnk >> 4) & 0x3f;
3725
3726		if (sc->pectl == 0) {
3727			pectl = pci_read_config(dev, reg + 0x8, 2);
3728			pectl = (pectl & ~0x7000) | (5 << 12);
3729			pci_write_config(dev, reg + 0x8, pectl, 2);
3730			sc->pectl = pectl;
3731		} else {
3732			/* restore saved pectl after watchdog reset */
3733			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3734		}
3735	}
3736
3737	/* Enable DMA and Memory space access */
3738	pci_enable_busmaster(dev);
3739	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3740	cmd |= PCIM_CMD_MEMEN;
3741	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3742}
3743
3744static uint32_t
3745mxge_read_reboot(mxge_softc_t *sc)
3746{
3747	device_t dev = sc->dev;
3748	uint32_t vs;
3749
3750	/* find the vendor specific offset */
3751	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3752		device_printf(sc->dev,
3753			      "could not find vendor specific offset\n");
3754		return (uint32_t)-1;
3755	}
3756	/* enable read32 mode */
3757	pci_write_config(dev, vs + 0x10, 0x3, 1);
3758	/* tell NIC which register to read */
3759	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3760	return (pci_read_config(dev, vs + 0x14, 4));
3761}
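/*
 * Editor's note on the accesses above: the vendor-specific capability
 * is used as a window register -- one config write enables 32-bit
 * reads, the next points the window at 0xfffffff0 (where the reboot
 * status word lives), and the final config read returns the value
 * through the window's data port.
 */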
3762
3763static void
3764mxge_watchdog_reset(mxge_softc_t *sc)
3765{
3766	struct pci_devinfo *dinfo;
3767	struct mxge_slice_state *ss;
3768	int err, running, s, num_tx_slices = 1;
3769	uint32_t reboot;
3770	uint16_t cmd;
3771
3772	err = ENXIO;
3773
3774	device_printf(sc->dev, "Watchdog reset!\n");
3775
3776	/*
3777	 * check to see if the NIC rebooted.  If it did, then all of
3778	 * PCI config space has been reset, and things like the
3779	 * busmaster bit will be zero.  If this is the case, then we
3780	 * must restore PCI config space before the NIC can be used
3781	 * again
3782	 */
3783	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3784	if (cmd == 0xffff) {
3785		/*
3786		 * maybe the watchdog caught the NIC rebooting; wait
3787		 * up to 100ms for it to finish.  If it does not come
3788		 * back, then give up
3789		 */
3790		DELAY(1000*100);
3791		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3792		if (cmd == 0xffff) {
3793			device_printf(sc->dev, "NIC disappeared!\n");
3794		}
3795	}
3796	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3797		/* print the reboot status */
3798		reboot = mxge_read_reboot(sc);
3799		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3800			      reboot);
3801		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3802		if (running) {
3803
3804			/*
3805			 * quiesce NIC so that TX routines will not try to
3806			 * xmit after restoration of BAR
3807			 */
3808
3809			/* Mark the link as down */
3810			if (sc->link_state) {
3811				sc->link_state = 0;
3812				if_link_state_change(sc->ifp,
3813						     LINK_STATE_DOWN);
3814			}
3815#ifdef IFNET_BUF_RING
3816			num_tx_slices = sc->num_slices;
3817#endif
3818			/* grab all TX locks to ensure no tx */
3819			for (s = 0; s < num_tx_slices; s++) {
3820				ss = &sc->ss[s];
3821				mtx_lock(&ss->tx.mtx);
3822			}
3823			mxge_close(sc, 1);
3824		}
3825		/* restore PCI configuration space */
3826		dinfo = device_get_ivars(sc->dev);
3827		pci_cfg_restore(sc->dev, dinfo);
3828
3829		/* and redo any changes we made to our config space */
3830		mxge_setup_cfg_space(sc);
3831
3832		/* reload f/w */
3833		err = mxge_load_firmware(sc, 0);
3834		if (err) {
3835			device_printf(sc->dev,
3836				      "Unable to re-load f/w\n");
3837		}
3838		if (running) {
3839			if (!err)
3840				err = mxge_open(sc);
3841			/* release all TX locks */
3842			for (s = 0; s < num_tx_slices; s++) {
3843				ss = &sc->ss[s];
3844#ifdef IFNET_BUF_RING
3845				mxge_start_locked(ss);
3846#endif
3847				mtx_unlock(&ss->tx.mtx);
3848			}
3849		}
3850		sc->watchdog_resets++;
3851	} else {
3852		device_printf(sc->dev,
3853			      "NIC did not reboot, not resetting\n");
3854		err = 0;
3855	}
3856	if (err) {
3857		device_printf(sc->dev, "watchdog reset failed\n");
3858	} else {
3859		if (sc->dying == 2)
3860			sc->dying = 0;
3861		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3862	}
3863}
3864
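/*
 * Taskqueue wrapper for the watchdog reset; deferring the reset to a
 * taskqueue thread gives it a context where the firmware reload may
 * sleep.
 */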
3865static void
3866mxge_watchdog_task(void *arg, int pending)
3867{
3868	mxge_softc_t *sc = arg;
3869
3870
3871	mtx_lock(&sc->driver_mtx);
3872	mxge_watchdog_reset(sc);
3873	mtx_unlock(&sc->driver_mtx);
3874}
3875
3876static void
3877mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3878{
3879	tx = &sc->ss[slice].tx;
3880	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3881	device_printf(sc->dev,
3882		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3883		      tx->req, tx->done, tx->queue_active);
3884	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3885			      tx->activate, tx->deactivate);
3886	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3887		      tx->pkt_done,
3888		      be32toh(sc->ss->fw_stats->send_done_count));
3889}
3890
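/*
 * Per-tick TX watchdog.  A slice is considered stuck when its TX ring
 * has pending work (req != done) but has made no progress since the
 * last check; if the pause-frame counter also did not move, the NIC is
 * not merely blocked by flow control, so a reset is scheduled.
 */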
3891static int
3892mxge_watchdog(mxge_softc_t *sc)
3893{
3894	mxge_tx_ring_t *tx;
3895	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3896	int i, err = 0;
3897
3898	/* see if we have outstanding transmits, which
3899	   have been pending for more than mxge_ticks */
3900	for (i = 0;
3901#ifdef IFNET_BUF_RING
3902	     (i < sc->num_slices) && (err == 0);
3903#else
3904	     (i < 1) && (err == 0);
3905#endif
3906	     i++) {
3907		tx = &sc->ss[i].tx;
3908		if (tx->req != tx->done &&
3909		    tx->watchdog_req != tx->watchdog_done &&
3910		    tx->done == tx->watchdog_done) {
3911			/* check for pause blocking before resetting */
3912			if (tx->watchdog_rx_pause == rx_pause) {
3913				mxge_warn_stuck(sc, tx, i);
3914				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3915				return (ENXIO);
3916			}
3917			else
3918				device_printf(sc->dev, "Flow control blocking "
3919					      "xmits, check link partner\n");
3920		}
3921
3922		tx->watchdog_req = tx->req;
3923		tx->watchdog_done = tx->done;
3924		tx->watchdog_rx_pause = rx_pause;
3925	}
3926
3927	if (sc->need_media_probe)
3928		mxge_media_probe(sc);
3929	return (err);
3930}
3931
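/*
 * Fold the per-slice counters into the ifnet statistics and return the
 * number of packets handled since the previous call; mxge_tick() uses
 * a zero return to detect an idle NIC.
 */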
3932static u_long
3933mxge_update_stats(mxge_softc_t *sc)
3934{
3935	struct mxge_slice_state *ss;
3936	u_long pkts = 0;
3937	u_long ipackets = 0;
3938	u_long opackets = 0;
3939#ifdef IFNET_BUF_RING
3940	u_long obytes = 0;
3941	u_long omcasts = 0;
3942	u_long odrops = 0;
3943#endif
3944	u_long oerrors = 0;
3945	int slice;
3946
3947	for (slice = 0; slice < sc->num_slices; slice++) {
3948		ss = &sc->ss[slice];
3949		ipackets += ss->ipackets;
3950		opackets += ss->opackets;
3951#ifdef IFNET_BUF_RING
3952		obytes += ss->obytes;
3953		omcasts += ss->omcasts;
3954		odrops += ss->tx.br->br_drops;
3955#endif
3956		oerrors += ss->oerrors;
3957	}
3958	pkts = (ipackets - sc->ifp->if_ipackets);
3959	pkts += (opackets - sc->ifp->if_opackets);
3960	sc->ifp->if_ipackets = ipackets;
3961	sc->ifp->if_opackets = opackets;
3962#ifdef IFNET_BUF_RING
3963	sc->ifp->if_obytes = obytes;
3964	sc->ifp->if_omcasts = omcasts;
3965	sc->ifp->if_snd.ifq_drops = odrops;
3966#endif
3967	sc->ifp->if_oerrors = oerrors;
3968	return pkts;
3969}
3970
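/*
 * Periodic housekeeping: refresh the interface statistics, run the TX
 * watchdog once every several ticks, and, when the NIC has been idle,
 * verify that it did not suffer a hardware fault (bus-master bit
 * cleared) and stretch the polling interval by 4x.
 */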
3971static void
3972mxge_tick(void *arg)
3973{
3974	mxge_softc_t *sc = arg;
3975	u_long pkts = 0;
3976	int err = 0;
3977	int running, ticks;
3978	uint16_t cmd;
3979
3980	ticks = mxge_ticks;
3981	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3982	if (running) {
3983		/* aggregate stats from different slices */
3984		pkts = mxge_update_stats(sc);
3985		if (!sc->watchdog_countdown) {
3986			err = mxge_watchdog(sc);
3987			sc->watchdog_countdown = 4;
3988		}
3989		sc->watchdog_countdown--;
3990	}
3991	if (pkts == 0) {
3992		/* ensure NIC did not suffer h/w fault while idle */
3993		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3994		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3995			sc->dying = 2;
3996			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3997			err = ENXIO;
3998		}
3999		/* look less often if NIC is idle */
4000		ticks *= 4;
4001	}
4002
4003	if (err == 0)
4004		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4005
4006}
4007
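/*
 * The media is fixed by the installed transceiver, so manual media
 * changes are rejected.
 */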
4008static int
4009mxge_media_change(struct ifnet *ifp)
4010{
4011	return EINVAL;
4012}
4013
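/*
 * Validate the requested MTU against the firmware limit and, if the
 * interface is running, restart it; on failure fall back to the old
 * MTU so the interface comes back up in a known-good state.
 */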
4014static int
4015mxge_change_mtu(mxge_softc_t *sc, int mtu)
4016{
4017	struct ifnet *ifp = sc->ifp;
4018	int real_mtu, old_mtu;
4019	int err = 0;
4020
4021
4022	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4023	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4024		return EINVAL;
4025	mtx_lock(&sc->driver_mtx);
4026	old_mtu = ifp->if_mtu;
4027	ifp->if_mtu = mtu;
4028	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4029		mxge_close(sc, 0);
4030		err = mxge_open(sc);
4031		if (err != 0) {
4032			ifp->if_mtu = old_mtu;
4033			mxge_close(sc, 0);
4034			(void) mxge_open(sc);
4035		}
4036	}
4037	mtx_unlock(&sc->driver_mtx);
4038	return err;
4039}
4040
4041static void
4042mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4043{
4044	mxge_softc_t *sc = ifp->if_softc;
4045
4046
4047	if (sc == NULL)
4048		return;
4049	ifmr->ifm_status = IFM_AVALID;
4050	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4051	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4052	ifmr->ifm_active |= sc->current_media;
4053}
4054
4055static int
4056mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4057{
4058	mxge_softc_t *sc = ifp->if_softc;
4059	struct ifreq *ifr = (struct ifreq *)data;
4060	int err, mask;
4061
4062	err = 0;
4063	switch (command) {
4064	case SIOCSIFADDR:
4065	case SIOCGIFADDR:
4066		err = ether_ioctl(ifp, command, data);
4067		break;
4068
4069	case SIOCSIFMTU:
4070		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4071		break;
4072
4073	case SIOCSIFFLAGS:
4074		mtx_lock(&sc->driver_mtx);
4075		if (sc->dying) {
4076			mtx_unlock(&sc->driver_mtx);
4077			return EINVAL;
4078		}
4079		if (ifp->if_flags & IFF_UP) {
4080			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4081				err = mxge_open(sc);
4082			} else {
4083				/* take care of promisc and allmulti
4084				   flag changes */
4085				mxge_change_promisc(sc,
4086						    ifp->if_flags & IFF_PROMISC);
4087				mxge_set_multicast_list(sc);
4088			}
4089		} else {
4090			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4091				mxge_close(sc, 0);
4092			}
4093		}
4094		mtx_unlock(&sc->driver_mtx);
4095		break;
4096
4097	case SIOCADDMULTI:
4098	case SIOCDELMULTI:
4099		mtx_lock(&sc->driver_mtx);
4100		mxge_set_multicast_list(sc);
4101		mtx_unlock(&sc->driver_mtx);
4102		break;
4103
4104	case SIOCSIFCAP:
4105		mtx_lock(&sc->driver_mtx);
4106		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4107		if (mask & IFCAP_TXCSUM) {
4108			if (IFCAP_TXCSUM & ifp->if_capenable) {
4109				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4110				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4111						      | CSUM_TSO);
4112			} else {
4113				ifp->if_capenable |= IFCAP_TXCSUM;
4114				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4115			}
4116		} else if (mask & IFCAP_RXCSUM) {
4117			if (IFCAP_RXCSUM & ifp->if_capenable) {
4118				ifp->if_capenable &= ~IFCAP_RXCSUM;
4119				sc->csum_flag = 0;
4120			} else {
4121				ifp->if_capenable |= IFCAP_RXCSUM;
4122				sc->csum_flag = 1;
4123			}
4124		}
4125		if (mask & IFCAP_TSO4) {
4126			if (IFCAP_TSO4 & ifp->if_capenable) {
4127				ifp->if_capenable &= ~IFCAP_TSO4;
4128				ifp->if_hwassist &= ~CSUM_TSO;
4129			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4130				ifp->if_capenable |= IFCAP_TSO4;
4131				ifp->if_hwassist |= CSUM_TSO;
4132			} else {
4133				printf("mxge requires tx checksum offload"
4134				       " be enabled to use TSO\n");
4135				err = EINVAL;
4136			}
4137		}
4138		if (mask & IFCAP_LRO) {
4139			if (IFCAP_LRO & ifp->if_capenable)
4140				err = mxge_change_lro_locked(sc, 0);
4141			else
4142				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4143		}
4144		if (mask & IFCAP_VLAN_HWTAGGING)
4145			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4146		if (mask & IFCAP_VLAN_HWTSO)
4147			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4148
4149		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4150		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4151			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4152
4153		mtx_unlock(&sc->driver_mtx);
4154		VLAN_CAPABILITIES(ifp);
4155
4156		break;
4157
4158	case SIOCGIFMEDIA:
4159		mtx_lock(&sc->driver_mtx);
4160		mxge_media_probe(sc);
4161		mtx_unlock(&sc->driver_mtx);
4162		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4163				    &sc->media, command);
4164		break;
4165
4166	default:
4167		err = ENOTTY;
4168	}
4169	return err;
4170}
4171
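/*
 * Fetch loader(8) tunables and clamp them to sane values.  As an
 * illustration (values are hypothetical, not recommendations), a
 * /boot/loader.conf might contain:
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 *	hw.mxge.initial_mtu="9000"
 */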
4172static void
4173mxge_fetch_tunables(mxge_softc_t *sc)
4174{
4175
4176	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4177	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4178			  &mxge_flow_control);
4179	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4180			  &mxge_intr_coal_delay);
4181	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4182			  &mxge_nvidia_ecrc_enable);
4183	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4184			  &mxge_force_firmware);
4185	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4186			  &mxge_deassert_wait);
4187	TUNABLE_INT_FETCH("hw.mxge.verbose",
4188			  &mxge_verbose);
4189	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4190	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4191	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
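	/*
	 * both spellings of the hash-type tunable are accepted; when
	 * both are set, the second fetch (rss_hashtype) wins
	 */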
4192	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4193	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4194	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4195	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4196	if (sc->lro_cnt != 0)
4197		mxge_lro_cnt = sc->lro_cnt;
4198
4199	if (bootverbose)
4200		mxge_verbose = 1;
4201	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4202		mxge_intr_coal_delay = 30;
4203	if (mxge_ticks == 0)
4204		mxge_ticks = hz / 2;
4205	sc->pause = mxge_flow_control;
4206	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4207	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4208		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4209	}
4210	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4211	    mxge_initial_mtu < ETHER_MIN_LEN)
4212		mxge_initial_mtu = ETHERMTU_JUMBO;
4213
4214	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4215		mxge_throttle = MXGE_MAX_THROTTLE;
4216	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4217		mxge_throttle = MXGE_MIN_THROTTLE;
4218	sc->throttle = mxge_throttle;
4219}
4220
4221
4222static void
4223mxge_free_slices(mxge_softc_t *sc)
4224{
4225	struct mxge_slice_state *ss;
4226	int i;
4227
4228
4229	if (sc->ss == NULL)
4230		return;
4231
4232	for (i = 0; i < sc->num_slices; i++) {
4233		ss = &sc->ss[i];
4234		if (ss->fw_stats != NULL) {
4235			mxge_dma_free(&ss->fw_stats_dma);
4236			ss->fw_stats = NULL;
4237#ifdef IFNET_BUF_RING
4238			if (ss->tx.br != NULL) {
4239				drbr_free(ss->tx.br, M_DEVBUF);
4240				ss->tx.br = NULL;
4241			}
4242#endif
4243			mtx_destroy(&ss->tx.mtx);
4244		}
4245		if (ss->rx_done.entry != NULL) {
4246			mxge_dma_free(&ss->rx_done.dma);
4247			ss->rx_done.entry = NULL;
4248		}
4249	}
4250	free(sc->ss, M_DEVBUF);
4251	sc->ss = NULL;
4252}
4253
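/*
 * Allocate per-slice state: each slice gets an rx_done (interrupt)
 * queue sized at two slots per receive-ring entry; slices that carry
 * transmit traffic (only slice 0 unless IFNET_BUF_RING is defined)
 * additionally get DMA-able firmware stats, a TX mutex and a buf_ring.
 */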
4254static int
4255mxge_alloc_slices(mxge_softc_t *sc)
4256{
4257	mxge_cmd_t cmd;
4258	struct mxge_slice_state *ss;
4259	size_t bytes;
4260	int err, i, max_intr_slots;
4261
4262	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4263	if (err != 0) {
4264		device_printf(sc->dev, "Cannot determine rx ring size\n");
4265		return err;
4266	}
4267	sc->rx_ring_size = cmd.data0;
4268	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4269
4270	bytes = sizeof (*sc->ss) * sc->num_slices;
4271	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4272	if (sc->ss == NULL)
4273		return (ENOMEM);
4274	for (i = 0; i < sc->num_slices; i++) {
4275		ss = &sc->ss[i];
4276
4277		ss->sc = sc;
4278
4279		/* allocate per-slice rx interrupt queues */
4280
4281		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4282		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4283		if (err != 0)
4284			goto abort;
4285		ss->rx_done.entry = ss->rx_done.dma.addr;
4286		bzero(ss->rx_done.entry, bytes);
4287
4288		/*
4289		 * allocate the per-slice firmware stats; stats
4290		 * (including tx) are used only on the first
4291		 * slice for now
4292		 */
4293#ifndef IFNET_BUF_RING
4294		if (i > 0)
4295			continue;
4296#endif
4297
4298		bytes = sizeof (*ss->fw_stats);
4299		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4300				     bytes, 64);
4301		if (err != 0)
4302			goto abort;
4303		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4304		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4305			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4306		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4307#ifdef IFNET_BUF_RING
4308		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4309					   &ss->tx.mtx);
4310#endif
4311	}
4312
4313	return (0);
4314
4315abort:
4316	mxge_free_slices(sc);
4317	return (ENOMEM);
4318}
4319
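/*
 * Decide how many slices (queue pairs) to use.  Multiple slices
 * require the hw.mxge.max_slices tunable, an SMP machine, at least
 * two MSI-X vectors and RSS-capable firmware; the count is then capped
 * by the MSI-X vector count, the CPU count (or the tunable) and
 * rounded down to a power of two.  Any failure falls back to a single
 * slice and the original firmware.
 */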
4320static void
4321mxge_slice_probe(mxge_softc_t *sc)
4322{
4323	mxge_cmd_t cmd;
4324	char *old_fw;
4325	int msix_cnt, status, max_intr_slots;
4326
4327	sc->num_slices = 1;
4328	/*
4329	 *  don't use multiple slices if they have been disabled via the
4330	 *  hw.mxge.max_slices tunable, or if this is not an SMP system
4331	 */
4332
4333	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4334		return;
4335
4336	/* see how many MSI-X interrupts are available */
4337	msix_cnt = pci_msix_count(sc->dev);
4338	if (msix_cnt < 2)
4339		return;
4340
4341	/* now load the slice-aware firmware to see what it supports */
4342	old_fw = sc->fw_name;
4343	if (old_fw == mxge_fw_aligned)
4344		sc->fw_name = mxge_fw_rss_aligned;
4345	else
4346		sc->fw_name = mxge_fw_rss_unaligned;
4347	status = mxge_load_firmware(sc, 0);
4348	if (status != 0) {
4349		device_printf(sc->dev, "Falling back to a single slice\n");
4350		return;
4351	}
4352
4353	/* try to send a reset command to the card to see if it
4354	   is alive */
4355	memset(&cmd, 0, sizeof (cmd));
4356	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4357	if (status != 0) {
4358		device_printf(sc->dev, "failed reset\n");
4359		goto abort_with_fw;
4360	}
4361
4362	/* get rx ring size */
4363	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4364	if (status != 0) {
4365		device_printf(sc->dev, "Cannot determine rx ring size\n");
4366		goto abort_with_fw;
4367	}
4368	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4369
4370	/* tell it the size of the interrupt queues */
4371	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4372	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4373	if (status != 0) {
4374		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4375		goto abort_with_fw;
4376	}
4377
4378	/* ask the firmware for the maximum number of slices it supports */
4379	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4380	if (status != 0) {
4381		device_printf(sc->dev,
4382			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4383		goto abort_with_fw;
4384	}
4385	sc->num_slices = cmd.data0;
4386	if (sc->num_slices > msix_cnt)
4387		sc->num_slices = msix_cnt;
4388
4389	if (mxge_max_slices == -1) {
4390		/* cap to number of CPUs in system */
4391		if (sc->num_slices > mp_ncpus)
4392			sc->num_slices = mp_ncpus;
4393	} else {
4394		if (sc->num_slices > mxge_max_slices)
4395			sc->num_slices = mxge_max_slices;
4396	}
4397	/* make sure it is a power of two */
4398	while (sc->num_slices & (sc->num_slices - 1))
4399		sc->num_slices--;
4400
4401	if (mxge_verbose)
4402		device_printf(sc->dev, "using %d slices\n",
4403			      sc->num_slices);
4404
4405	return;
4406
4407abort_with_fw:
4408	sc->fw_name = old_fw;
4409	(void) mxge_load_firmware(sc, 0);
4410}
4411
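/*
 * Set up one MSI-X vector per slice.  The MSI-X table lives behind
 * BAR(2), and the IRQ resource ID for message i is i + 1; the error
 * paths below unwind the allocations in reverse order.
 */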
4412static int
4413mxge_add_msix_irqs(mxge_softc_t *sc)
4414{
4415	size_t bytes;
4416	int count, err, i, rid;
4417
4418	rid = PCIR_BAR(2);
4419	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4420						    &rid, RF_ACTIVE);
4421
4422	if (sc->msix_table_res == NULL) {
4423		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4424		return ENXIO;
4425	}
4426
4427	count = sc->num_slices;
4428	err = pci_alloc_msix(sc->dev, &count);
4429	if (err != 0) {
4430		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4431			      "err = %d\n", sc->num_slices, err);
4432		goto abort_with_msix_table;
4433	}
4434	if (count < sc->num_slices) {
4435		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4436			      sc->num_slices, count);
4437		device_printf(sc->dev,
4438			      "Try setting hw.mxge.max_slices to %d\n",
4439			      count);
4440		err = ENOSPC;
4441		goto abort_with_msix;
4442	}
4443	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4444	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4445	if (sc->msix_irq_res == NULL) {
4446		err = ENOMEM;
4447		goto abort_with_msix;
4448	}
4449
4450	for (i = 0; i < sc->num_slices; i++) {
4451		rid = i + 1;
4452		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4453							  SYS_RES_IRQ,
4454							  &rid, RF_ACTIVE);
4455		if (sc->msix_irq_res[i] == NULL) {
4456			device_printf(sc->dev, "couldn't allocate IRQ res"
4457				      " for message %d\n", i);
4458			err = ENXIO;
4459			goto abort_with_res;
4460		}
4461	}
4462
4463	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4464	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
4465
4466	for (i = 0; i < sc->num_slices; i++) {
4467		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4468				     INTR_TYPE_NET | INTR_MPSAFE,
4469#if __FreeBSD_version > 700030
4470				     NULL,
4471#endif
4472				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4473		if (err != 0) {
4474			device_printf(sc->dev, "couldn't setup intr for "
4475				      "message %d\n", i);
4476			goto abort_with_intr;
4477		}
4478	}
4479
4480	if (mxge_verbose) {
4481		device_printf(sc->dev, "using %d msix IRQs:",
4482			      sc->num_slices);
4483		for (i = 0; i < sc->num_slices; i++)
4484			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4485		printf("\n");
4486	}
4487	return (0);
4488
4489abort_with_intr:
4490	for (i = 0; i < sc->num_slices; i++) {
4491		if (sc->msix_ih[i] != NULL) {
4492			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4493					  sc->msix_ih[i]);
4494			sc->msix_ih[i] = NULL;
4495		}
4496	}
4497	free(sc->msix_ih, M_DEVBUF);
4498
4499
4500abort_with_res:
4501	for (i = 0; i < sc->num_slices; i++) {
4502		rid = i + 1;
4503		if (sc->msix_irq_res[i] != NULL)
4504			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4505					     sc->msix_irq_res[i]);
4506		sc->msix_irq_res[i] = NULL;
4507	}
4508	free(sc->msix_irq_res, M_DEVBUF);
4509
4510
4511abort_with_msix:
4512	pci_release_msi(sc->dev);
4513
4514abort_with_msix_table:
4515	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4516			     sc->msix_table_res);
4517
4518	return err;
4519}
4520
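/*
 * Single-queue interrupt setup: use MSI when exactly one message is
 * available, otherwise fall back to a shareable legacy INTx line.
 */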
4521static int
4522mxge_add_single_irq(mxge_softc_t *sc)
4523{
4524	int count, err, rid;
4525
4526	count = pci_msi_count(sc->dev);
4527	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4528		rid = 1;
4529	} else {
4530		rid = 0;
4531		sc->legacy_irq = 1;
4532	}
4533	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4534					 1, RF_SHAREABLE | RF_ACTIVE);
4535	if (sc->irq_res == NULL) {
4536		device_printf(sc->dev, "could not alloc interrupt\n");
4537		return ENXIO;
4538	}
4539	if (mxge_verbose)
4540		device_printf(sc->dev, "using %s irq %ld\n",
4541			      sc->legacy_irq ? "INTx" : "MSI",
4542			      rman_get_start(sc->irq_res));
4543	err = bus_setup_intr(sc->dev, sc->irq_res,
4544			     INTR_TYPE_NET | INTR_MPSAFE,
4545#if __FreeBSD_version > 700030
4546			     NULL,
4547#endif
4548			     mxge_intr, &sc->ss[0], &sc->ih);
4549	if (err != 0) {
4550		bus_release_resource(sc->dev, SYS_RES_IRQ,
4551				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4552		if (!sc->legacy_irq)
4553			pci_release_msi(sc->dev);
4554	}
4555	return err;
4556}
4557
4558static void
4559mxge_rem_msix_irqs(mxge_softc_t *sc)
4560{
4561	int i, rid;
4562
4563	for (i = 0; i < sc->num_slices; i++) {
4564		if (sc->msix_ih[i] != NULL) {
4565			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4566					  sc->msix_ih[i]);
4567			sc->msix_ih[i] = NULL;
4568		}
4569	}
4570	free(sc->msix_ih, M_DEVBUF);
4571
4572	for (i = 0; i < sc->num_slices; i++) {
4573		rid = i + 1;
4574		if (sc->msix_irq_res[i] != NULL)
4575			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4576					     sc->msix_irq_res[i]);
4577		sc->msix_irq_res[i] = NULL;
4578	}
4579	free(sc->msix_irq_res, M_DEVBUF);
4580
4581	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4582			     sc->msix_table_res);
4583
4584	pci_release_msi(sc->dev);
4585	return;
4586}
4587
4588static void
4589mxge_rem_single_irq(mxge_softc_t *sc)
4590{
4591	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4592	bus_release_resource(sc->dev, SYS_RES_IRQ,
4593			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4594	if (!sc->legacy_irq)
4595		pci_release_msi(sc->dev);
4596}
4597
4598static void
4599mxge_rem_irq(mxge_softc_t *sc)
4600{
4601	if (sc->num_slices > 1)
4602		mxge_rem_msix_irqs(sc);
4603	else
4604		mxge_rem_single_irq(sc);
4605}
4606
4607static int
4608mxge_add_irq(mxge_softc_t *sc)
4609{
4610	int err;
4611
4612	if (sc->num_slices > 1)
4613		err = mxge_add_msix_irqs(sc);
4614	else
4615		err = mxge_add_single_irq(sc);
4616
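	/*
	 * note: the "0 &&" leaves this MSI-X teardown/re-setup path
	 * compiled but disabled; it appears to be a leftover test hook
	 */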
4617	if (0 && err == 0 && sc->num_slices > 1) {
4618		mxge_rem_msix_irqs(sc);
4619		err = mxge_add_msix_irqs(sc);
4620	}
4621	return err;
4622}
4623
4624
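/*
 * Device attach: create the watchdog taskqueue and parent DMA tag,
 * allocate the ifnet and locks, map the NIC's SRAM BAR and parse the
 * EEPROM strings (MAC address), load firmware, size the slices, rings
 * and interrupts, then attach the ethernet layer and start the tick
 * callout.  The abort_with_* labels unwind in reverse order.
 */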
4625static int
4626mxge_attach(device_t dev)
4627{
4628	mxge_softc_t *sc = device_get_softc(dev);
4629	struct ifnet *ifp;
4630	int err, rid;
4631
4632	sc->dev = dev;
4633	mxge_fetch_tunables(sc);
4634
4635	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4636	sc->tq = taskqueue_create_fast("mxge_taskq", M_WAITOK,
4637				       taskqueue_thread_enqueue,
4638				       &sc->tq);
4639	if (sc->tq == NULL) {
4640		err = ENOMEM;
4641		goto abort_with_nothing;
4642	}
4643
4644	err = bus_dma_tag_create(NULL,			/* parent */
4645				 1,			/* alignment */
4646				 0,			/* boundary */
4647				 BUS_SPACE_MAXADDR,	/* low */
4648				 BUS_SPACE_MAXADDR,	/* high */
4649				 NULL, NULL,		/* filter */
4650				 65536 + 256,		/* maxsize */
4651				 MXGE_MAX_SEND_DESC, 	/* num segs */
4652				 65536,			/* maxsegsize */
4653				 0,			/* flags */
4654				 NULL, NULL,		/* lock */
4655				 &sc->parent_dmat);	/* tag */
4656
4657	if (err != 0) {
4658		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4659			      err);
4660		goto abort_with_tq;
4661	}
4662
4663	ifp = sc->ifp = if_alloc(IFT_ETHER);
4664	if (ifp == NULL) {
4665		device_printf(dev, "can not if_alloc()\n");
4666		err = ENOSPC;
4667		goto abort_with_parent_dmat;
4668	}
4669	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4670
4671	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4672		 device_get_nameunit(dev));
4673	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4674	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4675		 "%s:drv", device_get_nameunit(dev));
4676	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4677		 MTX_NETWORK_LOCK, MTX_DEF);
4678
4679	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4680
4681	mxge_setup_cfg_space(sc);
4682
4683	/* Map the board into the kernel */
4684	rid = PCIR_BARS;
4685	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4686					 ~0, 1, RF_ACTIVE);
4687	if (sc->mem_res == NULL) {
4688		device_printf(dev, "could not map memory\n");
4689		err = ENXIO;
4690		goto abort_with_lock;
4691	}
4692	sc->sram = rman_get_virtual(sc->mem_res);
4693	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4694	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4695		device_printf(dev, "impossible memory region size %ld\n",
4696			      rman_get_size(sc->mem_res));
4697		err = ENXIO;
4698		goto abort_with_mem_res;
4699	}
4700
4701	/* make a NULL-terminated copy of the EEPROM strings section of
4702	   LANai SRAM */
4703	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4704	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4705				rman_get_bushandle(sc->mem_res),
4706				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4707				sc->eeprom_strings,
4708				MXGE_EEPROM_STRINGS_SIZE - 2);
4709	err = mxge_parse_strings(sc);
4710	if (err != 0)
4711		goto abort_with_mem_res;
4712
4713	/* Enable write combining for efficient use of PCIe bus */
4714	mxge_enable_wc(sc);
4715
4716	/* Allocate the out of band dma memory */
4717	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4718			     sizeof (mxge_cmd_t), 64);
4719	if (err != 0)
4720		goto abort_with_mem_res;
4721	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4722	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4723	if (err != 0)
4724		goto abort_with_cmd_dma;
4725
4726	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4727	if (err != 0)
4728		goto abort_with_zeropad_dma;
4729
4730	/* select & load the firmware */
4731	err = mxge_select_firmware(sc);
4732	if (err != 0)
4733		goto abort_with_dmabench;
4734	sc->intr_coal_delay = mxge_intr_coal_delay;
4735
4736	mxge_slice_probe(sc);
4737	err = mxge_alloc_slices(sc);
4738	if (err != 0)
4739		goto abort_with_dmabench;
4740
4741	err = mxge_reset(sc, 0);
4742	if (err != 0)
4743		goto abort_with_slices;
4744
4745	err = mxge_alloc_rings(sc);
4746	if (err != 0) {
4747		device_printf(sc->dev, "failed to allocate rings\n");
4748		goto abort_with_slices;
4749	}
4750
4751	err = mxge_add_irq(sc);
4752	if (err != 0) {
4753		device_printf(sc->dev, "failed to add irq\n");
4754		goto abort_with_rings;
4755	}
4756
4757	ifp->if_baudrate = IF_Gbps(10UL);
4758	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4759		IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
4760#ifdef INET
4761	ifp->if_capabilities |= IFCAP_LRO;
4762#endif
4763
4764#ifdef MXGE_NEW_VLAN_API
4765	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4766
4767	/* Only FW 1.4.32 and newer can do TSO over vlans */
4768	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4769	    sc->fw_ver_tiny >= 32)
4770		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4771#endif
4772
4773	sc->max_mtu = mxge_max_mtu(sc);
4774	if (sc->max_mtu >= 9000)
4775		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4776	else
4777		device_printf(dev, "MTU limited to %d.  Install "
4778			      "latest firmware for 9000 byte jumbo support\n",
4779			      sc->max_mtu - ETHER_HDR_LEN);
4780	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4781	ifp->if_capenable = ifp->if_capabilities;
4782	if (sc->lro_cnt == 0)
4783		ifp->if_capenable &= ~IFCAP_LRO;
4784	sc->csum_flag = 1;
4785	ifp->if_init = mxge_init;
4786	ifp->if_softc = sc;
4787	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4788	ifp->if_ioctl = mxge_ioctl;
4789	ifp->if_start = mxge_start;
4790	/* Initialise the ifmedia structure */
4791	ifmedia_init(&sc->media, 0, mxge_media_change,
4792		     mxge_media_status);
4793	mxge_media_init(sc);
4794	mxge_media_probe(sc);
4795	sc->dying = 0;
4796	ether_ifattach(ifp, sc->mac_addr);
4797	/* ether_ifattach sets mtu to ETHERMTU */
4798	if (mxge_initial_mtu != ETHERMTU)
4799		mxge_change_mtu(sc, mxge_initial_mtu);
4800
4801	mxge_add_sysctls(sc);
4802#ifdef IFNET_BUF_RING
4803	ifp->if_transmit = mxge_transmit;
4804	ifp->if_qflush = mxge_qflush;
4805#endif
4806	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4807				device_get_nameunit(sc->dev));
4808	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4809	return 0;
4810
4811abort_with_rings:
4812	mxge_free_rings(sc);
4813abort_with_slices:
4814	mxge_free_slices(sc);
4815abort_with_dmabench:
4816	mxge_dma_free(&sc->dmabench_dma);
4817abort_with_zeropad_dma:
4818	mxge_dma_free(&sc->zeropad_dma);
4819abort_with_cmd_dma:
4820	mxge_dma_free(&sc->cmd_dma);
4821abort_with_mem_res:
4822	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4823abort_with_lock:
4824	pci_disable_busmaster(dev);
4825	mtx_destroy(&sc->cmd_mtx);
4826	mtx_destroy(&sc->driver_mtx);
4827	if_free(ifp);
4828abort_with_parent_dmat:
4829	bus_dma_tag_destroy(sc->parent_dmat);
4830abort_with_tq:
4831	if (sc->tq != NULL) {
4832		taskqueue_drain(sc->tq, &sc->watchdog_task);
4833		taskqueue_free(sc->tq);
4834		sc->tq = NULL;
4835	}
4836abort_with_nothing:
4837	return err;
4838}
4839
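/*
 * Device detach: refuse while VLANs are still configured, mark the
 * device dying, close it, drain the taskqueue and callout, and release
 * everything attach allocated, in roughly the reverse order.
 */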
4840static int
4841mxge_detach(device_t dev)
4842{
4843	mxge_softc_t *sc = device_get_softc(dev);
4844
4845	if (mxge_vlans_active(sc)) {
4846		device_printf(sc->dev,
4847			      "Detach vlans before removing module\n");
4848		return EBUSY;
4849	}
4850	mtx_lock(&sc->driver_mtx);
4851	sc->dying = 1;
4852	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4853		mxge_close(sc, 0);
4854	mtx_unlock(&sc->driver_mtx);
4855	ether_ifdetach(sc->ifp);
4856	if (sc->tq != NULL) {
4857		taskqueue_drain(sc->tq, &sc->watchdog_task);
4858		taskqueue_free(sc->tq);
4859		sc->tq = NULL;
4860	}
4861	callout_drain(&sc->co_hdl);
4862	ifmedia_removeall(&sc->media);
4863	mxge_dummy_rdma(sc, 0);
4864	mxge_rem_sysctls(sc);
4865	mxge_rem_irq(sc);
4866	mxge_free_rings(sc);
4867	mxge_free_slices(sc);
4868	mxge_dma_free(&sc->dmabench_dma);
4869	mxge_dma_free(&sc->zeropad_dma);
4870	mxge_dma_free(&sc->cmd_dma);
4871	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4872	pci_disable_busmaster(dev);
4873	mtx_destroy(&sc->cmd_mtx);
4874	mtx_destroy(&sc->driver_mtx);
4875	if_free(sc->ifp);
4876	bus_dma_tag_destroy(sc->parent_dmat);
4877	return 0;
4878}
4879
4880static int
4881mxge_shutdown(device_t dev)
4882{
4883	return 0;
4884}
4885
4886/*
4887  This file uses Myri10GE driver indentation.
4888
4889  Local Variables:
4890  c-file-style:"linux"
4891  tab-width:8
4892  End:
4893*/
4894