if_mxge.c revision 229272
1/******************************************************************************
2
3Copyright (c) 2006-2009, Myricom Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Myricom Inc, nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 229272 2012-01-02 12:12:10Z ed $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/linker.h>
36#include <sys/firmware.h>
37#include <sys/endian.h>
38#include <sys/sockio.h>
39#include <sys/mbuf.h>
40#include <sys/malloc.h>
41#include <sys/kdb.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/module.h>
45#include <sys/socket.h>
46#include <sys/sysctl.h>
47#include <sys/sx.h>
48#include <sys/taskqueue.h>
49
50/* count xmits ourselves, rather than via drbr */
51#define NO_SLOW_STATS
52#include <net/if.h>
53#include <net/if_arp.h>
54#include <net/ethernet.h>
55#include <net/if_dl.h>
56#include <net/if_media.h>
57
58#include <net/bpf.h>
59
60#include <net/if_types.h>
61#include <net/if_vlan_var.h>
62#include <net/zlib.h>
63
64#include <netinet/in_systm.h>
65#include <netinet/in.h>
66#include <netinet/ip.h>
67#include <netinet/tcp.h>
68
69#include <machine/bus.h>
70#include <machine/in_cksum.h>
71#include <machine/resource.h>
72#include <sys/bus.h>
73#include <sys/rman.h>
74#include <sys/smp.h>
75
76#include <dev/pci/pcireg.h>
77#include <dev/pci/pcivar.h>
78#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
79
80#include <vm/vm.h>		/* for pmap_mapdev() */
81#include <vm/pmap.h>
82
83#if defined(__i386) || defined(__amd64)
84#include <machine/specialreg.h>
85#endif
86
87#include <dev/mxge/mxge_mcp.h>
88#include <dev/mxge/mcp_gen_header.h>
89/*#define MXGE_FAKE_IFP*/
90#include <dev/mxge/if_mxge_var.h>
91#ifdef IFNET_BUF_RING
92#include <sys/buf_ring.h>
93#endif
94
95#include "opt_inet.h"
96
97/* tunable params */
98static int mxge_nvidia_ecrc_enable = 1;
99static int mxge_force_firmware = 0;
100static int mxge_intr_coal_delay = 30;
101static int mxge_deassert_wait = 1;
102static int mxge_flow_control = 1;
103static int mxge_verbose = 0;
104static int mxge_lro_cnt = 8;
105static int mxge_ticks;
106static int mxge_max_slices = 1;
107static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
108static int mxge_always_promisc = 0;
109static int mxge_initial_mtu = ETHERMTU_JUMBO;
110static int mxge_throttle = 0;
111static char *mxge_fw_unaligned = "mxge_ethp_z8e";
112static char *mxge_fw_aligned = "mxge_eth_z8e";
113static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
114static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
115
116static int mxge_probe(device_t dev);
117static int mxge_attach(device_t dev);
118static int mxge_detach(device_t dev);
119static int mxge_shutdown(device_t dev);
120static void mxge_intr(void *arg);
121
122static device_method_t mxge_methods[] =
123{
124  /* Device interface */
125  DEVMETHOD(device_probe, mxge_probe),
126  DEVMETHOD(device_attach, mxge_attach),
127  DEVMETHOD(device_detach, mxge_detach),
128  DEVMETHOD(device_shutdown, mxge_shutdown),
129  {0, 0}
130};
131
132static driver_t mxge_driver =
133{
134  "mxge",
135  mxge_methods,
136  sizeof(mxge_softc_t),
137};
138
139static devclass_t mxge_devclass;
140
141/* Declare ourselves to be a child of the PCI bus.*/
142DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
143MODULE_DEPEND(mxge, firmware, 1, 1, 1);
144MODULE_DEPEND(mxge, zlib, 1, 1, 1);
145
146static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
147static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
148static int mxge_close(mxge_softc_t *sc, int down);
149static int mxge_open(mxge_softc_t *sc);
150static void mxge_tick(void *arg);
151
152static int
153mxge_probe(device_t dev)
154{
155	int rev;
156
157
158	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
159	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
160	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
161		rev = pci_get_revid(dev);
162		switch (rev) {
163		case MXGE_PCI_REV_Z8E:
164			device_set_desc(dev, "Myri10G-PCIE-8A");
165			break;
166		case MXGE_PCI_REV_Z8ES:
167			device_set_desc(dev, "Myri10G-PCIE-8B");
168			break;
169		default:
170			device_set_desc(dev, "Myri10G-PCIE-8??");
171			device_printf(dev, "Unrecognized rev %d NIC\n",
172				      rev);
173			break;
174		}
175		return 0;
176	}
177	return ENXIO;
178}
179
180static void
181mxge_enable_wc(mxge_softc_t *sc)
182{
183#if defined(__i386) || defined(__amd64)
184	vm_offset_t len;
185	int err;
186
187	sc->wc = 1;
188	len = rman_get_size(sc->mem_res);
189	err = pmap_change_attr((vm_offset_t) sc->sram,
190			       len, PAT_WRITE_COMBINING);
191	if (err != 0) {
192		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193			      err);
194		sc->wc = 0;
195	}
196#endif
197}
198
199
200/* callback to get our DMA address */
201static void
202mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
203			 int error)
204{
205	if (error == 0) {
206		*(bus_addr_t *) arg = segs->ds_addr;
207	}
208}
209
210static int
211mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
212		   bus_size_t alignment)
213{
214	int err;
215	device_t dev = sc->dev;
216	bus_size_t boundary, maxsegsize;
217
218	if (bytes > 4096 && alignment == 4096) {
219		boundary = 0;
220		maxsegsize = bytes;
221	} else {
222		boundary = 4096;
223		maxsegsize = 4096;
224	}
225
226	/* allocate DMAable memory tags */
227	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
228				 alignment,		/* alignment */
229				 boundary,		/* boundary */
230				 BUS_SPACE_MAXADDR,	/* low */
231				 BUS_SPACE_MAXADDR,	/* high */
232				 NULL, NULL,		/* filter */
233				 bytes,			/* maxsize */
234				 1,			/* num segs */
235				 maxsegsize,		/* maxsegsize */
236				 BUS_DMA_COHERENT,	/* flags */
237				 NULL, NULL,		/* lock */
238				 &dma->dmat);		/* tag */
239	if (err != 0) {
240		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
241		return err;
242	}
243
244	/* allocate DMAable memory & map */
245	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
246			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
247				| BUS_DMA_ZERO),  &dma->map);
248	if (err != 0) {
249		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
250		goto abort_with_dmat;
251	}
252
253	/* load the memory */
254	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
255			      mxge_dmamap_callback,
256			      (void *)&dma->bus_addr, 0);
257	if (err != 0) {
258		device_printf(dev, "couldn't load map (err = %d)\n", err);
259		goto abort_with_mem;
260	}
261	return 0;
262
263abort_with_mem:
264	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
265abort_with_dmat:
266	(void)bus_dma_tag_destroy(dma->dmat);
267	return err;
268}
269
270
271static void
272mxge_dma_free(mxge_dma_t *dma)
273{
274	bus_dmamap_unload(dma->dmat, dma->map);
275	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
276	(void)bus_dma_tag_destroy(dma->dmat);
277}
278
279/*
280 * The eeprom strings on the lanaiX have the format
281 * SN=x\0
282 * MAC=x:x:x:x:x:x\0
283 * PC=text\0
284 */
285
286static int
287mxge_parse_strings(mxge_softc_t *sc)
288{
289#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
290
291	char *ptr, *limit;
292	int i, found_mac;
293
294	ptr = sc->eeprom_strings;
295	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
296	found_mac = 0;
297	while (ptr < limit && *ptr != '\0') {
298		if (memcmp(ptr, "MAC=", 4) == 0) {
299			ptr += 1;
300			sc->mac_addr_string = ptr;
301			for (i = 0; i < 6; i++) {
302				ptr += 3;
303				if ((ptr + 2) > limit)
304					goto abort;
305				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
306				found_mac = 1;
307			}
308		} else if (memcmp(ptr, "PC=", 3) == 0) {
309			ptr += 3;
310			strncpy(sc->product_code_string, ptr,
311				sizeof (sc->product_code_string) - 1);
312		} else if (memcmp(ptr, "SN=", 3) == 0) {
313			ptr += 3;
314			strncpy(sc->serial_number_string, ptr,
315				sizeof (sc->serial_number_string) - 1);
316		}
317		MXGE_NEXT_STRING(ptr);
318	}
319
320	if (found_mac)
321		return 0;
322
323 abort:
324	device_printf(sc->dev, "failed to parse eeprom_strings\n");
325
326	return ENXIO;
327}
328
329#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
330static void
331mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
332{
333	uint32_t val;
334	unsigned long base, off;
335	char *va, *cfgptr;
336	device_t pdev, mcp55;
337	uint16_t vendor_id, device_id, word;
338	uintptr_t bus, slot, func, ivend, idev;
339	uint32_t *ptr32;
340
341
342	if (!mxge_nvidia_ecrc_enable)
343		return;
344
345	pdev = device_get_parent(device_get_parent(sc->dev));
346	if (pdev == NULL) {
347		device_printf(sc->dev, "could not find parent?\n");
348		return;
349	}
350	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
351	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
352
353	if (vendor_id != 0x10de)
354		return;
355
356	base = 0;
357
358	if (device_id == 0x005d) {
359		/* ck804, base address is magic */
360		base = 0xe0000000UL;
361	} else if (device_id >= 0x0374 && device_id <= 0x378) {
362		/* mcp55, base address stored in chipset */
363		mcp55 = pci_find_bsf(0, 0, 0);
364		if (mcp55 &&
365		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
366		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
367			word = pci_read_config(mcp55, 0x90, 2);
368			base = ((unsigned long)word & 0x7ffeU) << 25;
369		}
370	}
371	if (!base)
372		return;
373
374	/* XXXX
375	   Test below is commented because it is believed that doing
376	   config read/write beyond 0xff will access the config space
377	   for the next larger function.  Uncomment this and remove
378	   the hacky pmap_mapdev() way of accessing config space when
379	   FreeBSD grows support for extended pcie config space access
380	*/
381#if 0
382	/* See if we can, by some miracle, access the extended
383	   config space */
384	val = pci_read_config(pdev, 0x178, 4);
385	if (val != 0xffffffff) {
386		val |= 0x40;
387		pci_write_config(pdev, 0x178, val, 4);
388		return;
389	}
390#endif
391	/* Rather than using normal pci config space writes, we must
392	 * map the Nvidia config space ourselves.  This is because on
393	 * opteron/nvidia class machine the 0xe000000 mapping is
394	 * handled by the nvidia chipset, that means the internal PCI
395	 * device (the on-chip northbridge), or the amd-8131 bridge
396	 * and things behind them are not visible by this method.
397	 */
398
399	BUS_READ_IVAR(device_get_parent(pdev), pdev,
400		      PCI_IVAR_BUS, &bus);
401	BUS_READ_IVAR(device_get_parent(pdev), pdev,
402		      PCI_IVAR_SLOT, &slot);
403	BUS_READ_IVAR(device_get_parent(pdev), pdev,
404		      PCI_IVAR_FUNCTION, &func);
405	BUS_READ_IVAR(device_get_parent(pdev), pdev,
406		      PCI_IVAR_VENDOR, &ivend);
407	BUS_READ_IVAR(device_get_parent(pdev), pdev,
408		      PCI_IVAR_DEVICE, &idev);
409
410	off =  base
411		+ 0x00100000UL * (unsigned long)bus
412		+ 0x00001000UL * (unsigned long)(func
413						 + 8 * slot);
414
415	/* map it into the kernel */
416	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
417
418
419	if (va == NULL) {
420		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
421		return;
422	}
423	/* get a pointer to the config space mapped into the kernel */
424	cfgptr = va + (off & PAGE_MASK);
425
426	/* make sure that we can really access it */
427	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
428	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
429	if (! (vendor_id == ivend && device_id == idev)) {
430		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
431			      vendor_id, device_id);
432		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
433		return;
434	}
435
436	ptr32 = (uint32_t*)(cfgptr + 0x178);
437	val = *ptr32;
438
439	if (val == 0xffffffff) {
440		device_printf(sc->dev, "extended mapping failed\n");
441		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
442		return;
443	}
444	*ptr32 = val | 0x40;
445	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
446	if (mxge_verbose)
447		device_printf(sc->dev,
448			      "Enabled ECRC on upstream Nvidia bridge "
449			      "at %d:%d:%d\n",
450			      (int)bus, (int)slot, (int)func);
451	return;
452}
453#else
454static void
455mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
456{
457	device_printf(sc->dev,
458		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
459	return;
460}
461#endif
462
463
464static int
465mxge_dma_test(mxge_softc_t *sc, int test_type)
466{
467	mxge_cmd_t cmd;
468	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
469	int status;
470	uint32_t len;
471	char *test = " ";
472
473
474	/* Run a small DMA test.
475	 * The magic multipliers to the length tell the firmware
476	 * to do DMA read, write, or read+write tests.  The
477	 * results are returned in cmd.data0.  The upper 16
478	 * bits of the return is the number of transfers completed.
479	 * The lower 16 bits is the time in 0.5us ticks that the
480	 * transfers took to complete.
481	 */
482
483	len = sc->tx_boundary;
484
485	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
486	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
487	cmd.data2 = len * 0x10000;
488	status = mxge_send_cmd(sc, test_type, &cmd);
489	if (status != 0) {
490		test = "read";
491		goto abort;
492	}
493	sc->read_dma = ((cmd.data0>>16) * len * 2) /
494		(cmd.data0 & 0xffff);
495	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
496	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
497	cmd.data2 = len * 0x1;
498	status = mxge_send_cmd(sc, test_type, &cmd);
499	if (status != 0) {
500		test = "write";
501		goto abort;
502	}
503	sc->write_dma = ((cmd.data0>>16) * len * 2) /
504		(cmd.data0 & 0xffff);
505
506	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
507	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
508	cmd.data2 = len * 0x10001;
509	status = mxge_send_cmd(sc, test_type, &cmd);
510	if (status != 0) {
511		test = "read/write";
512		goto abort;
513	}
514	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
515		(cmd.data0 & 0xffff);
516
517abort:
518	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
519		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
520			      test, status);
521
522	return status;
523}
524
525/*
526 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
527 * when the PCI-E Completion packets are aligned on an 8-byte
528 * boundary.  Some PCI-E chip sets always align Completion packets; on
529 * the ones that do not, the alignment can be enforced by enabling
530 * ECRC generation (if supported).
531 *
532 * When PCI-E Completion packets are not aligned, it is actually more
533 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
534 *
535 * If the driver can neither enable ECRC nor verify that it has
536 * already been enabled, then it must use a firmware image which works
537 * around unaligned completion packets (ethp_z8e.dat), and it should
538 * also ensure that it never gives the device a Read-DMA which is
539 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
540 * enabled, then the driver should use the aligned (eth_z8e.dat)
541 * firmware image, and set tx_boundary to 4KB.
542 */
543
544static int
545mxge_firmware_probe(mxge_softc_t *sc)
546{
547	device_t dev = sc->dev;
548	int reg, status;
549	uint16_t pectl;
550
551	sc->tx_boundary = 4096;
552	/*
553	 * Verify the max read request size was set to 4KB
554	 * before trying the test with 4KB.
555	 */
556	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
557		pectl = pci_read_config(dev, reg + 0x8, 2);
558		if ((pectl & (5 << 12)) != (5 << 12)) {
559			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
560				      pectl);
561			sc->tx_boundary = 2048;
562		}
563	}
564
565	/*
566	 * load the optimized firmware (which assumes aligned PCIe
567	 * completions) in order to see if it works on this host.
568	 */
569	sc->fw_name = mxge_fw_aligned;
570	status = mxge_load_firmware(sc, 1);
571	if (status != 0) {
572		return status;
573	}
574
575	/*
576	 * Enable ECRC if possible
577	 */
578	mxge_enable_nvidia_ecrc(sc);
579
580	/*
581	 * Run a DMA test which watches for unaligned completions and
582	 * aborts on the first one seen.
583	 */
584
585	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
586	if (status == 0)
587		return 0; /* keep the aligned firmware */
588
589	if (status != E2BIG)
590		device_printf(dev, "DMA test failed: %d\n", status);
591	if (status == ENOSYS)
592		device_printf(dev, "Falling back to ethp! "
593			      "Please install up to date fw\n");
594	return status;
595}
596
597static int
598mxge_select_firmware(mxge_softc_t *sc)
599{
600	int aligned = 0;
601	int force_firmware = mxge_force_firmware;
602
603	if (sc->throttle)
604		force_firmware = sc->throttle;
605
606	if (force_firmware != 0) {
607		if (force_firmware == 1)
608			aligned = 1;
609		else
610			aligned = 0;
611		if (mxge_verbose)
612			device_printf(sc->dev,
613				      "Assuming %s completions (forced)\n",
614				      aligned ? "aligned" : "unaligned");
615		goto abort;
616	}
617
618	/* if the PCIe link width is 4 or less, we can use the aligned
619	   firmware and skip any checks */
620	if (sc->link_width != 0 && sc->link_width <= 4) {
621		device_printf(sc->dev,
622			      "PCIe x%d Link, expect reduced performance\n",
623			      sc->link_width);
624		aligned = 1;
625		goto abort;
626	}
627
628	if (0 == mxge_firmware_probe(sc))
629		return 0;
630
631abort:
632	if (aligned) {
633		sc->fw_name = mxge_fw_aligned;
634		sc->tx_boundary = 4096;
635	} else {
636		sc->fw_name = mxge_fw_unaligned;
637		sc->tx_boundary = 2048;
638	}
639	return (mxge_load_firmware(sc, 0));
640}
641
642union qualhack
643{
644        const char *ro_char;
645        char *rw_char;
646};
647
648static int
649mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
650{
651
652
653	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
654		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
655			      be32toh(hdr->mcp_type));
656		return EIO;
657	}
658
659	/* save firmware version for sysctl */
660	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
661	if (mxge_verbose)
662		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
663
664	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
665	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
666
667	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
668	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
669		device_printf(sc->dev, "Found firmware version %s\n",
670			      sc->fw_version);
671		device_printf(sc->dev, "Driver needs %d.%d\n",
672			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
673		return EINVAL;
674	}
675	return 0;
676
677}
678
679static void *
680z_alloc(void *nil, u_int items, u_int size)
681{
682        void *ptr;
683
684        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
685        return ptr;
686}
687
688static void
689z_free(void *nil, void *ptr)
690{
691        free(ptr, M_TEMP);
692}
693
694
695static int
696mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
697{
698	z_stream zs;
699	char *inflate_buffer;
700	const struct firmware *fw;
701	const mcp_gen_header_t *hdr;
702	unsigned hdr_offset;
703	int status;
704	unsigned int i;
705	char dummy;
706	size_t fw_len;
707
708	fw = firmware_get(sc->fw_name);
709	if (fw == NULL) {
710		device_printf(sc->dev, "Could not find firmware image %s\n",
711			      sc->fw_name);
712		return ENOENT;
713	}
714
715
716
717	/* setup zlib and decompress f/w */
718	bzero(&zs, sizeof (zs));
719	zs.zalloc = z_alloc;
720	zs.zfree = z_free;
721	status = inflateInit(&zs);
722	if (status != Z_OK) {
723		status = EIO;
724		goto abort_with_fw;
725	}
726
727	/* the uncompressed size is stored as the firmware version,
728	   which would otherwise go unused */
729	fw_len = (size_t) fw->version;
730	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
731	if (inflate_buffer == NULL)
732		goto abort_with_zs;
733	zs.avail_in = fw->datasize;
734	zs.next_in = __DECONST(char *, fw->data);
735	zs.avail_out = fw_len;
736	zs.next_out = inflate_buffer;
737	status = inflate(&zs, Z_FINISH);
738	if (status != Z_STREAM_END) {
739		device_printf(sc->dev, "zlib %d\n", status);
740		status = EIO;
741		goto abort_with_buffer;
742	}
743
744	/* check id */
745	hdr_offset = htobe32(*(const uint32_t *)
746			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
747	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
748		device_printf(sc->dev, "Bad firmware file");
749		status = EIO;
750		goto abort_with_buffer;
751	}
752	hdr = (const void*)(inflate_buffer + hdr_offset);
753
754	status = mxge_validate_firmware(sc, hdr);
755	if (status != 0)
756		goto abort_with_buffer;
757
758	/* Copy the inflated firmware to NIC SRAM. */
759	for (i = 0; i < fw_len; i += 256) {
760		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
761			      inflate_buffer + i,
762			      min(256U, (unsigned)(fw_len - i)));
763		wmb();
764		dummy = *sc->sram;
765		wmb();
766	}
767
768	*limit = fw_len;
769	status = 0;
770abort_with_buffer:
771	free(inflate_buffer, M_TEMP);
772abort_with_zs:
773	inflateEnd(&zs);
774abort_with_fw:
775	firmware_put(fw, FIRMWARE_UNLOAD);
776	return status;
777}
778
779/*
780 * Enable or disable periodic RDMAs from the host to make certain
781 * chipsets resend dropped PCIe messages
782 */
783
784static void
785mxge_dummy_rdma(mxge_softc_t *sc, int enable)
786{
787	char buf_bytes[72];
788	volatile uint32_t *confirm;
789	volatile char *submit;
790	uint32_t *buf, dma_low, dma_high;
791	int i;
792
793	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
794
795	/* clear confirmation addr */
796	confirm = (volatile uint32_t *)sc->cmd;
797	*confirm = 0;
798	wmb();
799
800	/* send an rdma command to the PCIe engine, and wait for the
801	   response in the confirmation address.  The firmware should
802	   write a -1 there to indicate it is alive and well
803	*/
804
805	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
806	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
807	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
808	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
809	buf[2] = htobe32(0xffffffff);		/* confirm data */
810	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
811	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
812	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
813	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
814	buf[5] = htobe32(enable);			/* enable? */
815
816
817	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
818
819	mxge_pio_copy(submit, buf, 64);
820	wmb();
821	DELAY(1000);
822	wmb();
823	i = 0;
824	while (*confirm != 0xffffffff && i < 20) {
825		DELAY(1000);
826		i++;
827	}
828	if (*confirm != 0xffffffff) {
829		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
830			      (enable ? "enable" : "disable"), confirm,
831			      *confirm);
832	}
833	return;
834}
835
836static int
837mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
838{
839	mcp_cmd_t *buf;
840	char buf_bytes[sizeof(*buf) + 8];
841	volatile mcp_cmd_response_t *response = sc->cmd;
842	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
843	uint32_t dma_low, dma_high;
844	int err, sleep_total = 0;
845
846	/* ensure buf is aligned to 8 bytes */
847	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
848
849	buf->data0 = htobe32(data->data0);
850	buf->data1 = htobe32(data->data1);
851	buf->data2 = htobe32(data->data2);
852	buf->cmd = htobe32(cmd);
853	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
854	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
855
856	buf->response_addr.low = htobe32(dma_low);
857	buf->response_addr.high = htobe32(dma_high);
858	mtx_lock(&sc->cmd_mtx);
859	response->result = 0xffffffff;
860	wmb();
861	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
862
863	/* wait up to 20ms */
864	err = EAGAIN;
865	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
866		bus_dmamap_sync(sc->cmd_dma.dmat,
867				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
868		wmb();
869		switch (be32toh(response->result)) {
870		case 0:
871			data->data0 = be32toh(response->data);
872			err = 0;
873			break;
874		case 0xffffffff:
875			DELAY(1000);
876			break;
877		case MXGEFW_CMD_UNKNOWN:
878			err = ENOSYS;
879			break;
880		case MXGEFW_CMD_ERROR_UNALIGNED:
881			err = E2BIG;
882			break;
883		case MXGEFW_CMD_ERROR_BUSY:
884			err = EBUSY;
885			break;
886		case MXGEFW_CMD_ERROR_I2C_ABSENT:
887			err = ENXIO;
888			break;
889		default:
890			device_printf(sc->dev,
891				      "mxge: command %d "
892				      "failed, result = %d\n",
893				      cmd, be32toh(response->result));
894			err = ENXIO;
895			break;
896		}
897		if (err != EAGAIN)
898			break;
899	}
900	if (err == EAGAIN)
901		device_printf(sc->dev, "mxge: command %d timed out"
902			      "result = %d\n",
903			      cmd, be32toh(response->result));
904	mtx_unlock(&sc->cmd_mtx);
905	return err;
906}
907
908static int
909mxge_adopt_running_firmware(mxge_softc_t *sc)
910{
911	struct mcp_gen_header *hdr;
912	const size_t bytes = sizeof (struct mcp_gen_header);
913	size_t hdr_offset;
914	int status;
915
916	/* find running firmware header */
917	hdr_offset = htobe32(*(volatile uint32_t *)
918			     (sc->sram + MCP_HEADER_PTR_OFFSET));
919
920	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
921		device_printf(sc->dev,
922			      "Running firmware has bad header offset (%d)\n",
923			      (int)hdr_offset);
924		return EIO;
925	}
926
927	/* copy header of running firmware from SRAM to host memory to
928	 * validate firmware */
929	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
930	if (hdr == NULL) {
931		device_printf(sc->dev, "could not malloc firmware hdr\n");
932		return ENOMEM;
933	}
934	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
935				rman_get_bushandle(sc->mem_res),
936				hdr_offset, (char *)hdr, bytes);
937	status = mxge_validate_firmware(sc, hdr);
938	free(hdr, M_DEVBUF);
939
940	/*
941	 * check to see if adopted firmware has bug where adopting
942	 * it will cause broadcasts to be filtered unless the NIC
943	 * is kept in ALLMULTI mode
944	 */
945	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
946	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
947		sc->adopted_rx_filter_bug = 1;
948		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
949			      "working around rx filter bug\n",
950			      sc->fw_ver_major, sc->fw_ver_minor,
951			      sc->fw_ver_tiny);
952	}
953
954	return status;
955}
956
957
958static int
959mxge_load_firmware(mxge_softc_t *sc, int adopt)
960{
961	volatile uint32_t *confirm;
962	volatile char *submit;
963	char buf_bytes[72];
964	uint32_t *buf, size, dma_low, dma_high;
965	int status, i;
966
967	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
968
969	size = sc->sram_size;
970	status = mxge_load_firmware_helper(sc, &size);
971	if (status) {
972		if (!adopt)
973			return status;
974		/* Try to use the currently running firmware, if
975		   it is new enough */
976		status = mxge_adopt_running_firmware(sc);
977		if (status) {
978			device_printf(sc->dev,
979				      "failed to adopt running firmware\n");
980			return status;
981		}
982		device_printf(sc->dev,
983			      "Successfully adopted running firmware\n");
984		if (sc->tx_boundary == 4096) {
985			device_printf(sc->dev,
986				"Using firmware currently running on NIC"
987				 ".  For optimal\n");
988			device_printf(sc->dev,
989				 "performance consider loading optimized "
990				 "firmware\n");
991		}
992		sc->fw_name = mxge_fw_unaligned;
993		sc->tx_boundary = 2048;
994		return 0;
995	}
996	/* clear confirmation addr */
997	confirm = (volatile uint32_t *)sc->cmd;
998	*confirm = 0;
999	wmb();
1000	/* send a reload command to the bootstrap MCP, and wait for the
1001	   response in the confirmation address.  The firmware should
1002	   write a -1 there to indicate it is alive and well
1003	*/
1004
1005	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1006	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1007
1008	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1009	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1010	buf[2] = htobe32(0xffffffff);	/* confirm data */
1011
1012	/* FIX: All newest firmware should un-protect the bottom of
1013	   the sram before handoff. However, the very first interfaces
1014	   do not. Therefore the handoff copy must skip the first 8 bytes
1015	*/
1016					/* where the code starts*/
1017	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1018	buf[4] = htobe32(size - 8); 	/* length of code */
1019	buf[5] = htobe32(8);		/* where to copy to */
1020	buf[6] = htobe32(0);		/* where to jump to */
1021
1022	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1023	mxge_pio_copy(submit, buf, 64);
1024	wmb();
1025	DELAY(1000);
1026	wmb();
1027	i = 0;
1028	while (*confirm != 0xffffffff && i < 20) {
1029		DELAY(1000*10);
1030		i++;
1031		bus_dmamap_sync(sc->cmd_dma.dmat,
1032				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1033	}
1034	if (*confirm != 0xffffffff) {
1035		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1036			confirm, *confirm);
1037
1038		return ENXIO;
1039	}
1040	return 0;
1041}
1042
1043static int
1044mxge_update_mac_address(mxge_softc_t *sc)
1045{
1046	mxge_cmd_t cmd;
1047	uint8_t *addr = sc->mac_addr;
1048	int status;
1049
1050
1051	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1052		     | (addr[2] << 8) | addr[3]);
1053
1054	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1055
1056	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1057	return status;
1058}
1059
1060static int
1061mxge_change_pause(mxge_softc_t *sc, int pause)
1062{
1063	mxge_cmd_t cmd;
1064	int status;
1065
1066	if (pause)
1067		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1068				       &cmd);
1069	else
1070		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1071				       &cmd);
1072
1073	if (status) {
1074		device_printf(sc->dev, "Failed to set flow control mode\n");
1075		return ENXIO;
1076	}
1077	sc->pause = pause;
1078	return 0;
1079}
1080
1081static void
1082mxge_change_promisc(mxge_softc_t *sc, int promisc)
1083{
1084	mxge_cmd_t cmd;
1085	int status;
1086
1087	if (mxge_always_promisc)
1088		promisc = 1;
1089
1090	if (promisc)
1091		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1092				       &cmd);
1093	else
1094		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1095				       &cmd);
1096
1097	if (status) {
1098		device_printf(sc->dev, "Failed to set promisc mode\n");
1099	}
1100}
1101
1102static void
1103mxge_set_multicast_list(mxge_softc_t *sc)
1104{
1105	mxge_cmd_t cmd;
1106	struct ifmultiaddr *ifma;
1107	struct ifnet *ifp = sc->ifp;
1108	int err;
1109
1110	/* This firmware is known to not support multicast */
1111	if (!sc->fw_multicast_support)
1112		return;
1113
1114	/* Disable multicast filtering while we play with the lists*/
1115	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1116	if (err != 0) {
1117		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1118		       " error status: %d\n", err);
1119		return;
1120	}
1121
1122	if (sc->adopted_rx_filter_bug)
1123		return;
1124
1125	if (ifp->if_flags & IFF_ALLMULTI)
1126		/* request to disable multicast filtering, so quit here */
1127		return;
1128
1129	/* Flush all the filters */
1130
1131	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1132	if (err != 0) {
1133		device_printf(sc->dev,
1134			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1135			      ", error status: %d\n", err);
1136		return;
1137	}
1138
1139	/* Walk the multicast list, and add each address */
1140
1141	if_maddr_rlock(ifp);
1142	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1143		if (ifma->ifma_addr->sa_family != AF_LINK)
1144			continue;
1145		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1146		      &cmd.data0, 4);
1147		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1148		      &cmd.data1, 2);
1149		cmd.data0 = htonl(cmd.data0);
1150		cmd.data1 = htonl(cmd.data1);
1151		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1152		if (err != 0) {
1153			device_printf(sc->dev, "Failed "
1154			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1155			       "%d\t", err);
1156			/* abort, leaving multicast filtering off */
1157			if_maddr_runlock(ifp);
1158			return;
1159		}
1160	}
1161	if_maddr_runlock(ifp);
1162	/* Enable multicast filtering */
1163	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1164	if (err != 0) {
1165		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1166		       ", error status: %d\n", err);
1167	}
1168}
1169
1170static int
1171mxge_max_mtu(mxge_softc_t *sc)
1172{
1173	mxge_cmd_t cmd;
1174	int status;
1175
1176	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1177		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1178
1179	/* try to set nbufs to see if it we can
1180	   use virtually contiguous jumbos */
1181	cmd.data0 = 0;
1182	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1183			       &cmd);
1184	if (status == 0)
1185		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1186
1187	/* otherwise, we're limited to MJUMPAGESIZE */
1188	return MJUMPAGESIZE - MXGEFW_PAD;
1189}
1190
1191static int
1192mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1193{
1194	struct mxge_slice_state *ss;
1195	mxge_rx_done_t *rx_done;
1196	volatile uint32_t *irq_claim;
1197	mxge_cmd_t cmd;
1198	int slice, status;
1199
1200	/* try to send a reset command to the card to see if it
1201	   is alive */
1202	memset(&cmd, 0, sizeof (cmd));
1203	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1204	if (status != 0) {
1205		device_printf(sc->dev, "failed reset\n");
1206		return ENXIO;
1207	}
1208
1209	mxge_dummy_rdma(sc, 1);
1210
1211
1212	/* set the intrq size */
1213	cmd.data0 = sc->rx_ring_size;
1214	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1215
1216	/*
1217	 * Even though we already know how many slices are supported
1218	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1219	 * has magic side effects, and must be called after a reset.
1220	 * It must be called prior to calling any RSS related cmds,
1221	 * including assigning an interrupt queue for anything but
1222	 * slice 0.  It must also be called *after*
1223	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1224	 * the firmware to compute offsets.
1225	 */
1226
1227	if (sc->num_slices > 1) {
1228		/* ask the maximum number of slices it supports */
1229		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1230					   &cmd);
1231		if (status != 0) {
1232			device_printf(sc->dev,
1233				      "failed to get number of slices\n");
1234			return status;
1235		}
1236		/*
1237		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1238		 * to setting up the interrupt queue DMA
1239		 */
1240		cmd.data0 = sc->num_slices;
1241		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1242#ifdef IFNET_BUF_RING
1243		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1244#endif
1245		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1246					   &cmd);
1247		if (status != 0) {
1248			device_printf(sc->dev,
1249				      "failed to set number of slices\n");
1250			return status;
1251		}
1252	}
1253
1254
1255	if (interrupts_setup) {
1256		/* Now exchange information about interrupts  */
1257		for (slice = 0; slice < sc->num_slices; slice++) {
1258			rx_done = &sc->ss[slice].rx_done;
1259			memset(rx_done->entry, 0, sc->rx_ring_size);
1260			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1261			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1262			cmd.data2 = slice;
1263			status |= mxge_send_cmd(sc,
1264						MXGEFW_CMD_SET_INTRQ_DMA,
1265						&cmd);
1266		}
1267	}
1268
1269	status |= mxge_send_cmd(sc,
1270				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1271
1272
1273	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1274
1275	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1276	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1277
1278
1279	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1280				&cmd);
1281	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1282	if (status != 0) {
1283		device_printf(sc->dev, "failed set interrupt parameters\n");
1284		return status;
1285	}
1286
1287
1288	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1289
1290
1291	/* run a DMA benchmark */
1292	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1293
1294	for (slice = 0; slice < sc->num_slices; slice++) {
1295		ss = &sc->ss[slice];
1296
1297		ss->irq_claim = irq_claim + (2 * slice);
1298		/* reset mcp/driver shared state back to 0 */
1299		ss->rx_done.idx = 0;
1300		ss->rx_done.cnt = 0;
1301		ss->tx.req = 0;
1302		ss->tx.done = 0;
1303		ss->tx.pkt_done = 0;
1304		ss->tx.queue_active = 0;
1305		ss->tx.activate = 0;
1306		ss->tx.deactivate = 0;
1307		ss->tx.wake = 0;
1308		ss->tx.defrag = 0;
1309		ss->tx.stall = 0;
1310		ss->rx_big.cnt = 0;
1311		ss->rx_small.cnt = 0;
1312		ss->lro_bad_csum = 0;
1313		ss->lro_queued = 0;
1314		ss->lro_flushed = 0;
1315		if (ss->fw_stats != NULL) {
1316			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1317		}
1318	}
1319	sc->rdma_tags_available = 15;
1320	status = mxge_update_mac_address(sc);
1321	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1322	mxge_change_pause(sc, sc->pause);
1323	mxge_set_multicast_list(sc);
1324	if (sc->throttle) {
1325		cmd.data0 = sc->throttle;
1326		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1327				  &cmd)) {
1328			device_printf(sc->dev,
1329				      "can't enable throttle\n");
1330		}
1331	}
1332	return status;
1333}
1334
1335static int
1336mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1337{
1338	mxge_cmd_t cmd;
1339	mxge_softc_t *sc;
1340	int err;
1341	unsigned int throttle;
1342
1343	sc = arg1;
1344	throttle = sc->throttle;
1345	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1346        if (err != 0) {
1347                return err;
1348        }
1349
1350	if (throttle == sc->throttle)
1351		return 0;
1352
1353        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1354                return EINVAL;
1355
1356	mtx_lock(&sc->driver_mtx);
1357	cmd.data0 = throttle;
1358	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1359	if (err == 0)
1360		sc->throttle = throttle;
1361	mtx_unlock(&sc->driver_mtx);
1362	return err;
1363}
1364
1365static int
1366mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1367{
1368        mxge_softc_t *sc;
1369        unsigned int intr_coal_delay;
1370        int err;
1371
1372        sc = arg1;
1373        intr_coal_delay = sc->intr_coal_delay;
1374        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1375        if (err != 0) {
1376                return err;
1377        }
1378        if (intr_coal_delay == sc->intr_coal_delay)
1379                return 0;
1380
1381        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1382                return EINVAL;
1383
1384	mtx_lock(&sc->driver_mtx);
1385	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1386	sc->intr_coal_delay = intr_coal_delay;
1387
1388	mtx_unlock(&sc->driver_mtx);
1389        return err;
1390}
1391
1392static int
1393mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1394{
1395        mxge_softc_t *sc;
1396        unsigned int enabled;
1397        int err;
1398
1399        sc = arg1;
1400        enabled = sc->pause;
1401        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1402        if (err != 0) {
1403                return err;
1404        }
1405        if (enabled == sc->pause)
1406                return 0;
1407
1408	mtx_lock(&sc->driver_mtx);
1409	err = mxge_change_pause(sc, enabled);
1410	mtx_unlock(&sc->driver_mtx);
1411        return err;
1412}
1413
1414static int
1415mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1416{
1417	struct ifnet *ifp;
1418	int err = 0;
1419
1420	ifp = sc->ifp;
1421	if (lro_cnt == 0)
1422		ifp->if_capenable &= ~IFCAP_LRO;
1423	else
1424		ifp->if_capenable |= IFCAP_LRO;
1425	sc->lro_cnt = lro_cnt;
1426	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1427		mxge_close(sc, 0);
1428		err = mxge_open(sc);
1429	}
1430	return err;
1431}
1432
1433static int
1434mxge_change_lro(SYSCTL_HANDLER_ARGS)
1435{
1436	mxge_softc_t *sc;
1437	unsigned int lro_cnt;
1438	int err;
1439
1440	sc = arg1;
1441	lro_cnt = sc->lro_cnt;
1442	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1443	if (err != 0)
1444		return err;
1445
1446	if (lro_cnt == sc->lro_cnt)
1447		return 0;
1448
1449	if (lro_cnt > 128)
1450		return EINVAL;
1451
1452	mtx_lock(&sc->driver_mtx);
1453	err = mxge_change_lro_locked(sc, lro_cnt);
1454	mtx_unlock(&sc->driver_mtx);
1455	return err;
1456}
1457
1458static int
1459mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1460{
1461        int err;
1462
1463        if (arg1 == NULL)
1464                return EFAULT;
1465        arg2 = be32toh(*(int *)arg1);
1466        arg1 = NULL;
1467        err = sysctl_handle_int(oidp, arg1, arg2, req);
1468
1469        return err;
1470}
1471
1472static void
1473mxge_rem_sysctls(mxge_softc_t *sc)
1474{
1475	struct mxge_slice_state *ss;
1476	int slice;
1477
1478	if (sc->slice_sysctl_tree == NULL)
1479		return;
1480
1481	for (slice = 0; slice < sc->num_slices; slice++) {
1482		ss = &sc->ss[slice];
1483		if (ss == NULL || ss->sysctl_tree == NULL)
1484			continue;
1485		sysctl_ctx_free(&ss->sysctl_ctx);
1486		ss->sysctl_tree = NULL;
1487	}
1488	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1489	sc->slice_sysctl_tree = NULL;
1490}
1491
1492static void
1493mxge_add_sysctls(mxge_softc_t *sc)
1494{
1495	struct sysctl_ctx_list *ctx;
1496	struct sysctl_oid_list *children;
1497	mcp_irq_data_t *fw;
1498	struct mxge_slice_state *ss;
1499	int slice;
1500	char slice_num[8];
1501
1502	ctx = device_get_sysctl_ctx(sc->dev);
1503	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1504	fw = sc->ss[0].fw_stats;
1505
1506	/* random information */
1507	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1508		       "firmware_version",
1509		       CTLFLAG_RD, &sc->fw_version,
1510		       0, "firmware version");
1511	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1512		       "serial_number",
1513		       CTLFLAG_RD, &sc->serial_number_string,
1514		       0, "serial number");
1515	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1516		       "product_code",
1517		       CTLFLAG_RD, &sc->product_code_string,
1518		       0, "product_code");
1519	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520		       "pcie_link_width",
1521		       CTLFLAG_RD, &sc->link_width,
1522		       0, "tx_boundary");
1523	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1524		       "tx_boundary",
1525		       CTLFLAG_RD, &sc->tx_boundary,
1526		       0, "tx_boundary");
1527	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1528		       "write_combine",
1529		       CTLFLAG_RD, &sc->wc,
1530		       0, "write combining PIO?");
1531	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532		       "read_dma_MBs",
1533		       CTLFLAG_RD, &sc->read_dma,
1534		       0, "DMA Read speed in MB/s");
1535	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1536		       "write_dma_MBs",
1537		       CTLFLAG_RD, &sc->write_dma,
1538		       0, "DMA Write speed in MB/s");
1539	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1540		       "read_write_dma_MBs",
1541		       CTLFLAG_RD, &sc->read_write_dma,
1542		       0, "DMA concurrent Read/Write speed in MB/s");
1543	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1544		       "watchdog_resets",
1545		       CTLFLAG_RD, &sc->watchdog_resets,
1546		       0, "Number of times NIC was reset");
1547
1548
1549	/* performance related tunables */
1550	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1551			"intr_coal_delay",
1552			CTLTYPE_INT|CTLFLAG_RW, sc,
1553			0, mxge_change_intr_coal,
1554			"I", "interrupt coalescing delay in usecs");
1555
1556	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557			"throttle",
1558			CTLTYPE_INT|CTLFLAG_RW, sc,
1559			0, mxge_change_throttle,
1560			"I", "transmit throttling");
1561
1562	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1563			"flow_control_enabled",
1564			CTLTYPE_INT|CTLFLAG_RW, sc,
1565			0, mxge_change_flow_control,
1566			"I", "interrupt coalescing delay in usecs");
1567
1568	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1569		       "deassert_wait",
1570		       CTLFLAG_RW, &mxge_deassert_wait,
1571		       0, "Wait for IRQ line to go low in ihandler");
1572
1573	/* stats block from firmware is in network byte order.
1574	   Need to swap it */
1575	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576			"link_up",
1577			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1578			0, mxge_handle_be32,
1579			"I", "link up");
1580	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581			"rdma_tags_available",
1582			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1583			0, mxge_handle_be32,
1584			"I", "rdma_tags_available");
1585	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1586			"dropped_bad_crc32",
1587			CTLTYPE_INT|CTLFLAG_RD,
1588			&fw->dropped_bad_crc32,
1589			0, mxge_handle_be32,
1590			"I", "dropped_bad_crc32");
1591	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592			"dropped_bad_phy",
1593			CTLTYPE_INT|CTLFLAG_RD,
1594			&fw->dropped_bad_phy,
1595			0, mxge_handle_be32,
1596			"I", "dropped_bad_phy");
1597	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598			"dropped_link_error_or_filtered",
1599			CTLTYPE_INT|CTLFLAG_RD,
1600			&fw->dropped_link_error_or_filtered,
1601			0, mxge_handle_be32,
1602			"I", "dropped_link_error_or_filtered");
1603	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1604			"dropped_link_overflow",
1605			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1606			0, mxge_handle_be32,
1607			"I", "dropped_link_overflow");
1608	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1609			"dropped_multicast_filtered",
1610			CTLTYPE_INT|CTLFLAG_RD,
1611			&fw->dropped_multicast_filtered,
1612			0, mxge_handle_be32,
1613			"I", "dropped_multicast_filtered");
1614	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1615			"dropped_no_big_buffer",
1616			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1617			0, mxge_handle_be32,
1618			"I", "dropped_no_big_buffer");
1619	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1620			"dropped_no_small_buffer",
1621			CTLTYPE_INT|CTLFLAG_RD,
1622			&fw->dropped_no_small_buffer,
1623			0, mxge_handle_be32,
1624			"I", "dropped_no_small_buffer");
1625	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1626			"dropped_overrun",
1627			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1628			0, mxge_handle_be32,
1629			"I", "dropped_overrun");
1630	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1631			"dropped_pause",
1632			CTLTYPE_INT|CTLFLAG_RD,
1633			&fw->dropped_pause,
1634			0, mxge_handle_be32,
1635			"I", "dropped_pause");
1636	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1637			"dropped_runt",
1638			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1639			0, mxge_handle_be32,
1640			"I", "dropped_runt");
1641
1642	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1643			"dropped_unicast_filtered",
1644			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1645			0, mxge_handle_be32,
1646			"I", "dropped_unicast_filtered");
1647
1648	/* verbose printing? */
1649	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1650		       "verbose",
1651		       CTLFLAG_RW, &mxge_verbose,
1652		       0, "verbose printing");
1653
1654	/* lro */
1655	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1656			"lro_cnt",
1657			CTLTYPE_INT|CTLFLAG_RW, sc,
1658			0, mxge_change_lro,
1659			"I", "number of lro merge queues");
1660
1661
1662	/* add counters exported for debugging from all slices */
1663	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1664	sc->slice_sysctl_tree =
1665		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1666				"slice", CTLFLAG_RD, 0, "");
1667
1668	for (slice = 0; slice < sc->num_slices; slice++) {
1669		ss = &sc->ss[slice];
1670		sysctl_ctx_init(&ss->sysctl_ctx);
1671		ctx = &ss->sysctl_ctx;
1672		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1673		sprintf(slice_num, "%d", slice);
1674		ss->sysctl_tree =
1675			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1676					CTLFLAG_RD, 0, "");
1677		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1678		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679			       "rx_small_cnt",
1680			       CTLFLAG_RD, &ss->rx_small.cnt,
1681			       0, "rx_small_cnt");
1682		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683			       "rx_big_cnt",
1684			       CTLFLAG_RD, &ss->rx_big.cnt,
1685			       0, "rx_small_cnt");
1686		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1688			       0, "number of lro merge queues flushed");
1689
1690		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1692			       0, "number of frames appended to lro merge"
1693			       "queues");
1694
1695#ifndef IFNET_BUF_RING
1696		/* only transmit from slice 0 for now */
1697		if (slice > 0)
1698			continue;
1699#endif
1700		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1701			       "tx_req",
1702			       CTLFLAG_RD, &ss->tx.req,
1703			       0, "tx_req");
1704
1705		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1706			       "tx_done",
1707			       CTLFLAG_RD, &ss->tx.done,
1708			       0, "tx_done");
1709		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1710			       "tx_pkt_done",
1711			       CTLFLAG_RD, &ss->tx.pkt_done,
1712			       0, "tx_done");
1713		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1714			       "tx_stall",
1715			       CTLFLAG_RD, &ss->tx.stall,
1716			       0, "tx_stall");
1717		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1718			       "tx_wake",
1719			       CTLFLAG_RD, &ss->tx.wake,
1720			       0, "tx_wake");
1721		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1722			       "tx_defrag",
1723			       CTLFLAG_RD, &ss->tx.defrag,
1724			       0, "tx_defrag");
1725		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1726			       "tx_queue_active",
1727			       CTLFLAG_RD, &ss->tx.queue_active,
1728			       0, "tx_queue_active");
1729		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1730			       "tx_activate",
1731			       CTLFLAG_RD, &ss->tx.activate,
1732			       0, "tx_activate");
1733		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1734			       "tx_deactivate",
1735			       CTLFLAG_RD, &ss->tx.deactivate,
1736			       0, "tx_deactivate");
1737	}
1738}
1739
1740/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1741   backwards one at a time and handle ring wraps */
1742
1743static inline void
1744mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1745			    mcp_kreq_ether_send_t *src, int cnt)
1746{
1747        int idx, starting_slot;
1748        starting_slot = tx->req;
1749        while (cnt > 1) {
1750                cnt--;
1751                idx = (starting_slot + cnt) & tx->mask;
1752                mxge_pio_copy(&tx->lanai[idx],
1753			      &src[cnt], sizeof(*src));
1754                wmb();
1755        }
1756}
1757
1758/*
1759 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1760 * at most 32 bytes at a time, so as to avoid involving the software
1761 * pio handler in the nic.   We re-write the first segment's flags
1762 * to mark them valid only after writing the entire chain
1763 */
1764
1765static inline void
1766mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1767                  int cnt)
1768{
1769        int idx, i;
1770        uint32_t *src_ints;
1771	volatile uint32_t *dst_ints;
1772        mcp_kreq_ether_send_t *srcp;
1773	volatile mcp_kreq_ether_send_t *dstp, *dst;
1774	uint8_t last_flags;
1775
1776        idx = tx->req & tx->mask;
1777
1778	last_flags = src->flags;
1779	src->flags = 0;
1780        wmb();
1781        dst = dstp = &tx->lanai[idx];
1782        srcp = src;
1783
1784        if ((idx + cnt) < tx->mask) {
1785                for (i = 0; i < (cnt - 1); i += 2) {
1786                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1787                        wmb(); /* force write every 32 bytes */
1788                        srcp += 2;
1789                        dstp += 2;
1790                }
1791        } else {
1792                /* submit all but the first request, and ensure
1793                   that it is submitted below */
1794                mxge_submit_req_backwards(tx, src, cnt);
1795                i = 0;
1796        }
1797        if (i < cnt) {
1798                /* submit the first request */
1799                mxge_pio_copy(dstp, srcp, sizeof(*src));
1800                wmb(); /* barrier before setting valid flag */
1801        }
1802
1803        /* re-write the last 32-bits with the valid flags */
1804        src->flags = last_flags;
1805        src_ints = (uint32_t *)src;
1806        src_ints+=3;
1807        dst_ints = (volatile uint32_t *)dst;
1808        dst_ints+=3;
1809        *dst_ints =  *src_ints;
1810        tx->req += cnt;
1811        wmb();
1812}
1813
1814#if IFCAP_TSO4
1815
1816static void
1817mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1818	       int busdma_seg_cnt, int ip_off)
1819{
1820	mxge_tx_ring_t *tx;
1821	mcp_kreq_ether_send_t *req;
1822	bus_dma_segment_t *seg;
1823	struct ip *ip;
1824	struct tcphdr *tcp;
1825	uint32_t low, high_swapped;
1826	int len, seglen, cum_len, cum_len_next;
1827	int next_is_first, chop, cnt, rdma_count, small;
1828	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1829	uint8_t flags, flags_next;
1830	static int once;
1831
1832	mss = m->m_pkthdr.tso_segsz;
1833
1834	/* negative cum_len signifies to the
1835	 * send loop that we are still in the
1836	 * header portion of the TSO packet.
1837	 */
1838
1839	/* ensure we have the ethernet, IP and TCP
1840	   header together in the first mbuf, copy
1841	   it to a scratch buffer if not */
1842	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1843		m_copydata(m, 0, ip_off + sizeof (*ip),
1844			   ss->scratch);
1845		ip = (struct ip *)(ss->scratch + ip_off);
1846	} else {
1847		ip = (struct ip *)(mtod(m, char *) + ip_off);
1848	}
1849	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1850			    + sizeof (*tcp))) {
1851		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1852			   + sizeof (*tcp),  ss->scratch);
1853		ip = (struct ip *)(mtod(m, char *) + ip_off);
1854	}
1855
1856	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1857	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1858	cksum_offset = ip_off + (ip->ip_hl << 2);
1859
1860	/* TSO implies checksum offload on this hardware */
1861	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP)) == 0)) {
1862		/*
1863		 * If packet has full TCP csum, replace it with pseudo hdr
1864		 * sum that the NIC expects, otherwise the NIC will emit
1865		 * packets with bad TCP checksums.
1866		 */
1867		m->m_pkthdr.csum_flags = CSUM_TCP;
1868		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1869		tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1870			htons(IPPROTO_TCP + (m->m_pkthdr.len - cksum_offset)));
1871	}
1872	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1873
1874
1875	/* for TSO, pseudo_hdr_offset holds mss.
1876	 * The firmware figures out where to put
1877	 * the checksum by parsing the header. */
1878	pseudo_hdr_offset = htobe16(mss);
1879
1880	tx = &ss->tx;
1881	req = tx->req_list;
1882	seg = tx->seg_list;
1883	cnt = 0;
1884	rdma_count = 0;
1885	/* "rdma_count" is the number of RDMAs belonging to the
1886	 * current packet BEFORE the current send request. For
1887	 * non-TSO packets, this is equal to "count".
1888	 * For TSO packets, rdma_count needs to be reset
1889	 * to 0 after a segment cut.
1890	 *
1891	 * The rdma_count field of the send request is
1892	 * the number of RDMAs of the packet starting at
1893	 * that request. For TSO send requests with one ore more cuts
1894	 * in the middle, this is the number of RDMAs starting
1895	 * after the last cut in the request. All previous
1896	 * segments before the last cut implicitly have 1 RDMA.
1897	 *
1898	 * Since the number of RDMAs is not known beforehand,
1899	 * it must be filled-in retroactively - after each
1900	 * segmentation cut or at the end of the entire packet.
1901	 */
1902
1903	while (busdma_seg_cnt) {
1904		/* Break the busdma segment up into pieces*/
1905		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1906		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1907		len = seg->ds_len;
1908
1909		while (len) {
1910			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1911			seglen = len;
1912			cum_len_next = cum_len + seglen;
1913			(req-rdma_count)->rdma_count = rdma_count + 1;
1914			if (__predict_true(cum_len >= 0)) {
1915				/* payload */
1916				chop = (cum_len_next > mss);
1917				cum_len_next = cum_len_next % mss;
1918				next_is_first = (cum_len_next == 0);
1919				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1920				flags_next |= next_is_first *
1921					MXGEFW_FLAGS_FIRST;
1922				rdma_count |= -(chop | next_is_first);
1923				rdma_count += chop & !next_is_first;
1924			} else if (cum_len_next >= 0) {
1925				/* header ends */
1926				rdma_count = -1;
1927				cum_len_next = 0;
1928				seglen = -cum_len;
1929				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1930				flags_next = MXGEFW_FLAGS_TSO_PLD |
1931					MXGEFW_FLAGS_FIRST |
1932					(small * MXGEFW_FLAGS_SMALL);
1933			    }
1934
1935			req->addr_high = high_swapped;
1936			req->addr_low = htobe32(low);
1937			req->pseudo_hdr_offset = pseudo_hdr_offset;
1938			req->pad = 0;
1939			req->rdma_count = 1;
1940			req->length = htobe16(seglen);
1941			req->cksum_offset = cksum_offset;
1942			req->flags = flags | ((cum_len & 1) *
1943					      MXGEFW_FLAGS_ALIGN_ODD);
1944			low += seglen;
1945			len -= seglen;
1946			cum_len = cum_len_next;
1947			flags = flags_next;
1948			req++;
1949			cnt++;
1950			rdma_count++;
1951			if (__predict_false(cksum_offset > seglen))
1952				cksum_offset -= seglen;
1953			else
1954				cksum_offset = 0;
1955			if (__predict_false(cnt > tx->max_desc))
1956				goto drop;
1957		}
1958		busdma_seg_cnt--;
1959		seg++;
1960	}
1961	(req-rdma_count)->rdma_count = rdma_count;
1962
1963	do {
1964		req--;
1965		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1966	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1967
1968	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1969	mxge_submit_req(tx, tx->req_list, cnt);
1970#ifdef IFNET_BUF_RING
1971	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1972		/* tell the NIC to start polling this slice */
1973		*tx->send_go = 1;
1974		tx->queue_active = 1;
1975		tx->activate++;
1976		wmb();
1977	}
1978#endif
1979	return;
1980
1981drop:
1982	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1983	m_freem(m);
1984	ss->oerrors++;
1985	if (!once) {
1986		printf("tx->max_desc exceeded via TSO!\n");
1987		printf("mss = %d, %ld, %d!\n", mss,
1988		       (long)seg - (long)tx->seg_list, tx->max_desc);
1989		once = 1;
1990	}
1991	return;
1992
1993}
1994
1995#endif /* IFCAP_TSO4 */
1996
1997#ifdef MXGE_NEW_VLAN_API
1998/*
1999 * We reproduce the software vlan tag insertion from
2000 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2001 * vlan tag insertion. We need to advertise this in order to have the
2002 * vlan interface respect our csum offload flags.
2003 */
2004static struct mbuf *
2005mxge_vlan_tag_insert(struct mbuf *m)
2006{
2007	struct ether_vlan_header *evl;
2008
2009	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
2010	if (__predict_false(m == NULL))
2011		return NULL;
2012	if (m->m_len < sizeof(*evl)) {
2013		m = m_pullup(m, sizeof(*evl));
2014		if (__predict_false(m == NULL))
2015			return NULL;
2016	}
2017	/*
2018	 * Transform the Ethernet header into an Ethernet header
2019	 * with 802.1Q encapsulation.
2020	 */
2021	evl = mtod(m, struct ether_vlan_header *);
2022	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2023	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2024	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2025	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2026	m->m_flags &= ~M_VLANTAG;
2027	return m;
2028}
2029#endif /* MXGE_NEW_VLAN_API */
2030
2031static void
2032mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2033{
2034	mxge_softc_t *sc;
2035	mcp_kreq_ether_send_t *req;
2036	bus_dma_segment_t *seg;
2037	struct mbuf *m_tmp;
2038	struct ifnet *ifp;
2039	mxge_tx_ring_t *tx;
2040	struct ip *ip;
2041	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2042	uint16_t pseudo_hdr_offset;
2043        uint8_t flags, cksum_offset;
2044
2045
2046	sc = ss->sc;
2047	ifp = sc->ifp;
2048	tx = &ss->tx;
2049
2050	ip_off = sizeof (struct ether_header);
2051#ifdef MXGE_NEW_VLAN_API
2052	if (m->m_flags & M_VLANTAG) {
2053		m = mxge_vlan_tag_insert(m);
2054		if (__predict_false(m == NULL))
2055			goto drop;
2056		ip_off += ETHER_VLAN_ENCAP_LEN;
2057	}
2058#endif
2059	/* (try to) map the frame for DMA */
2060	idx = tx->req & tx->mask;
2061	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2062				      m, tx->seg_list, &cnt,
2063				      BUS_DMA_NOWAIT);
2064	if (__predict_false(err == EFBIG)) {
2065		/* Too many segments in the chain.  Try
2066		   to defrag */
2067		m_tmp = m_defrag(m, M_NOWAIT);
2068		if (m_tmp == NULL) {
2069			goto drop;
2070		}
2071		ss->tx.defrag++;
2072		m = m_tmp;
2073		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2074					      tx->info[idx].map,
2075					      m, tx->seg_list, &cnt,
2076					      BUS_DMA_NOWAIT);
2077	}
2078	if (__predict_false(err != 0)) {
2079		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2080			      " packet len = %d\n", err, m->m_pkthdr.len);
2081		goto drop;
2082	}
2083	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2084			BUS_DMASYNC_PREWRITE);
2085	tx->info[idx].m = m;
2086
2087#if IFCAP_TSO4
2088	/* TSO is different enough, we handle it in another routine */
2089	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2090		mxge_encap_tso(ss, m, cnt, ip_off);
2091		return;
2092	}
2093#endif
2094
2095	req = tx->req_list;
2096	cksum_offset = 0;
2097	pseudo_hdr_offset = 0;
2098	flags = MXGEFW_FLAGS_NO_TSO;
2099
2100	/* checksum offloading? */
2101	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2102		/* ensure ip header is in first mbuf, copy
2103		   it to a scratch buffer if not */
2104		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2105			m_copydata(m, 0, ip_off + sizeof (*ip),
2106				   ss->scratch);
2107			ip = (struct ip *)(ss->scratch + ip_off);
2108		} else {
2109			ip = (struct ip *)(mtod(m, char *) + ip_off);
2110		}
2111		cksum_offset = ip_off + (ip->ip_hl << 2);
2112		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2113		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2114		req->cksum_offset = cksum_offset;
2115		flags |= MXGEFW_FLAGS_CKSUM;
2116		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2117	} else {
2118		odd_flag = 0;
2119	}
2120	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2121		flags |= MXGEFW_FLAGS_SMALL;
2122
2123	/* convert segments into a request list */
2124	cum_len = 0;
2125	seg = tx->seg_list;
2126	req->flags = MXGEFW_FLAGS_FIRST;
2127	for (i = 0; i < cnt; i++) {
2128		req->addr_low =
2129			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2130		req->addr_high =
2131			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2132		req->length = htobe16(seg->ds_len);
2133		req->cksum_offset = cksum_offset;
2134		if (cksum_offset > seg->ds_len)
2135			cksum_offset -= seg->ds_len;
2136		else
2137			cksum_offset = 0;
2138		req->pseudo_hdr_offset = pseudo_hdr_offset;
2139		req->pad = 0; /* complete solid 16-byte block */
2140		req->rdma_count = 1;
2141		req->flags |= flags | ((cum_len & 1) * odd_flag);
2142		cum_len += seg->ds_len;
2143		seg++;
2144		req++;
2145		req->flags = 0;
2146	}
2147	req--;
2148	/* pad runts to 60 bytes */
2149	if (cum_len < 60) {
2150		req++;
2151		req->addr_low =
2152			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2153		req->addr_high =
2154			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2155		req->length = htobe16(60 - cum_len);
2156		req->cksum_offset = 0;
2157		req->pseudo_hdr_offset = pseudo_hdr_offset;
2158		req->pad = 0; /* complete solid 16-byte block */
2159		req->rdma_count = 1;
2160		req->flags |= flags | ((cum_len & 1) * odd_flag);
2161		cnt++;
2162	}
2163
2164	tx->req_list[0].rdma_count = cnt;
2165#if 0
2166	/* print what the firmware will see */
2167	for (i = 0; i < cnt; i++) {
2168		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2169		    "cso:%d, flags:0x%x, rdma:%d\n",
2170		    i, (int)ntohl(tx->req_list[i].addr_high),
2171		    (int)ntohl(tx->req_list[i].addr_low),
2172		    (int)ntohs(tx->req_list[i].length),
2173		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2174		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2175		    tx->req_list[i].rdma_count);
2176	}
2177	printf("--------------\n");
2178#endif
2179	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2180	mxge_submit_req(tx, tx->req_list, cnt);
2181#ifdef IFNET_BUF_RING
2182	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2183		/* tell the NIC to start polling this slice */
2184		*tx->send_go = 1;
2185		tx->queue_active = 1;
2186		tx->activate++;
2187		wmb();
2188	}
2189#endif
2190	return;
2191
2192drop:
2193	m_freem(m);
2194	ss->oerrors++;
2195	return;
2196}
2197
2198#ifdef IFNET_BUF_RING
2199static void
2200mxge_qflush(struct ifnet *ifp)
2201{
2202	mxge_softc_t *sc = ifp->if_softc;
2203	mxge_tx_ring_t *tx;
2204	struct mbuf *m;
2205	int slice;
2206
2207	for (slice = 0; slice < sc->num_slices; slice++) {
2208		tx = &sc->ss[slice].tx;
2209		mtx_lock(&tx->mtx);
2210		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2211			m_freem(m);
2212		mtx_unlock(&tx->mtx);
2213	}
2214	if_qflush(ifp);
2215}
2216
2217static inline void
2218mxge_start_locked(struct mxge_slice_state *ss)
2219{
2220	mxge_softc_t *sc;
2221	struct mbuf *m;
2222	struct ifnet *ifp;
2223	mxge_tx_ring_t *tx;
2224
2225	sc = ss->sc;
2226	ifp = sc->ifp;
2227	tx = &ss->tx;
2228
2229	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2230		m = drbr_dequeue(ifp, tx->br);
2231		if (m == NULL) {
2232			return;
2233		}
2234		/* let BPF see it */
2235		BPF_MTAP(ifp, m);
2236
2237		/* give it to the nic */
2238		mxge_encap(ss, m);
2239	}
2240	/* ran out of transmit slots */
2241	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2242	    && (!drbr_empty(ifp, tx->br))) {
2243		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2244		tx->stall++;
2245	}
2246}
2247
2248static int
2249mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2250{
2251	mxge_softc_t *sc;
2252	struct ifnet *ifp;
2253	mxge_tx_ring_t *tx;
2254	int err;
2255
2256	sc = ss->sc;
2257	ifp = sc->ifp;
2258	tx = &ss->tx;
2259
2260	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2261	    IFF_DRV_RUNNING) {
2262		err = drbr_enqueue(ifp, tx->br, m);
2263		return (err);
2264	}
2265
2266	if (!drbr_needs_enqueue(ifp, tx->br) &&
2267	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2268		/* let BPF see it */
2269		BPF_MTAP(ifp, m);
2270		/* give it to the nic */
2271		mxge_encap(ss, m);
2272	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2273		return (err);
2274	}
2275	if (!drbr_empty(ifp, tx->br))
2276		mxge_start_locked(ss);
2277	return (0);
2278}
2279
2280static int
2281mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2282{
2283	mxge_softc_t *sc = ifp->if_softc;
2284	struct mxge_slice_state *ss;
2285	mxge_tx_ring_t *tx;
2286	int err = 0;
2287	int slice;
2288
2289	slice = m->m_pkthdr.flowid;
2290	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2291
2292	ss = &sc->ss[slice];
2293	tx = &ss->tx;
2294
2295	if (mtx_trylock(&tx->mtx)) {
2296		err = mxge_transmit_locked(ss, m);
2297		mtx_unlock(&tx->mtx);
2298	} else {
2299		err = drbr_enqueue(ifp, tx->br, m);
2300	}
2301
2302	return (err);
2303}
2304
2305#else
2306
2307static inline void
2308mxge_start_locked(struct mxge_slice_state *ss)
2309{
2310	mxge_softc_t *sc;
2311	struct mbuf *m;
2312	struct ifnet *ifp;
2313	mxge_tx_ring_t *tx;
2314
2315	sc = ss->sc;
2316	ifp = sc->ifp;
2317	tx = &ss->tx;
2318	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2319		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2320		if (m == NULL) {
2321			return;
2322		}
2323		/* let BPF see it */
2324		BPF_MTAP(ifp, m);
2325
2326		/* give it to the nic */
2327		mxge_encap(ss, m);
2328	}
2329	/* ran out of transmit slots */
2330	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2331		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2332		tx->stall++;
2333	}
2334}
2335#endif
2336static void
2337mxge_start(struct ifnet *ifp)
2338{
2339	mxge_softc_t *sc = ifp->if_softc;
2340	struct mxge_slice_state *ss;
2341
2342	/* only use the first slice for now */
2343	ss = &sc->ss[0];
2344	mtx_lock(&ss->tx.mtx);
2345	mxge_start_locked(ss);
2346	mtx_unlock(&ss->tx.mtx);
2347}
2348
2349/*
2350 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2351 * at most 32 bytes at a time, so as to avoid involving the software
2352 * pio handler in the nic.   We re-write the first segment's low
2353 * DMA address to mark it valid only after we write the entire chunk
2354 * in a burst
2355 */
2356static inline void
2357mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2358		mcp_kreq_ether_recv_t *src)
2359{
2360	uint32_t low;
2361
2362	low = src->addr_low;
2363	src->addr_low = 0xffffffff;
2364	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2365	wmb();
2366	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2367	wmb();
2368	src->addr_low = low;
2369	dst->addr_low = low;
2370	wmb();
2371}
2372
2373static int
2374mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2375{
2376	bus_dma_segment_t seg;
2377	struct mbuf *m;
2378	mxge_rx_ring_t *rx = &ss->rx_small;
2379	int cnt, err;
2380
2381	m = m_gethdr(M_DONTWAIT, MT_DATA);
2382	if (m == NULL) {
2383		rx->alloc_fail++;
2384		err = ENOBUFS;
2385		goto done;
2386	}
2387	m->m_len = MHLEN;
2388	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2389				      &seg, &cnt, BUS_DMA_NOWAIT);
2390	if (err != 0) {
2391		m_free(m);
2392		goto done;
2393	}
2394	rx->info[idx].m = m;
2395	rx->shadow[idx].addr_low =
2396		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2397	rx->shadow[idx].addr_high =
2398		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2399
2400done:
2401	if ((idx & 7) == 7)
2402		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2403	return err;
2404}
2405
2406static int
2407mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2408{
2409	bus_dma_segment_t seg[3];
2410	struct mbuf *m;
2411	mxge_rx_ring_t *rx = &ss->rx_big;
2412	int cnt, err, i;
2413
2414	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2415	if (m == NULL) {
2416		rx->alloc_fail++;
2417		err = ENOBUFS;
2418		goto done;
2419	}
2420	m->m_len = rx->mlen;
2421	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2422				      seg, &cnt, BUS_DMA_NOWAIT);
2423	if (err != 0) {
2424		m_free(m);
2425		goto done;
2426	}
2427	rx->info[idx].m = m;
2428	rx->shadow[idx].addr_low =
2429		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2430	rx->shadow[idx].addr_high =
2431		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2432
2433#if MXGE_VIRT_JUMBOS
2434	for (i = 1; i < cnt; i++) {
2435		rx->shadow[idx + i].addr_low =
2436			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2437		rx->shadow[idx + i].addr_high =
2438			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2439       }
2440#endif
2441
2442done:
2443       for (i = 0; i < rx->nbufs; i++) {
2444		if ((idx & 7) == 7) {
2445			mxge_submit_8rx(&rx->lanai[idx - 7],
2446					&rx->shadow[idx - 7]);
2447		}
2448		idx++;
2449	}
2450	return err;
2451}
2452
2453/*
2454 *  Myri10GE hardware checksums are not valid if the sender
2455 *  padded the frame with non-zero padding.  This is because
2456 *  the firmware just does a simple 16-bit 1s complement
2457 *  checksum across the entire frame, excluding the first 14
2458 *  bytes.  It is best to simply to check the checksum and
2459 *  tell the stack about it only if the checksum is good
2460 */
2461
2462static inline uint16_t
2463mxge_rx_csum(struct mbuf *m, int csum)
2464{
2465	struct ether_header *eh;
2466	struct ip *ip;
2467	uint16_t c;
2468
2469	eh = mtod(m, struct ether_header *);
2470
2471	/* only deal with IPv4 TCP & UDP for now */
2472	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2473		return 1;
2474	ip = (struct ip *)(eh + 1);
2475	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2476			    ip->ip_p != IPPROTO_UDP))
2477		return 1;
2478#ifdef INET
2479	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2480		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
2481			    - (ip->ip_hl << 2) + ip->ip_p));
2482#else
2483	c = 1;
2484#endif
2485	c ^= 0xffff;
2486	return (c);
2487}
2488
2489static void
2490mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2491{
2492	struct ether_vlan_header *evl;
2493	struct ether_header *eh;
2494	uint32_t partial;
2495
2496	evl = mtod(m, struct ether_vlan_header *);
2497	eh = mtod(m, struct ether_header *);
2498
2499	/*
2500	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2501	 * after what the firmware thought was the end of the ethernet
2502	 * header.
2503	 */
2504
2505	/* put checksum into host byte order */
2506	*csum = ntohs(*csum);
2507	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2508	(*csum) += ~partial;
2509	(*csum) +=  ((*csum) < ~partial);
2510	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2511	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2512
2513	/* restore checksum to network byte order;
2514	   later consumers expect this */
2515	*csum = htons(*csum);
2516
2517	/* save the tag */
2518#ifdef MXGE_NEW_VLAN_API
2519	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2520#else
2521	{
2522		struct m_tag *mtag;
2523		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2524				   M_NOWAIT);
2525		if (mtag == NULL)
2526			return;
2527		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2528		m_tag_prepend(m, mtag);
2529	}
2530
2531#endif
2532	m->m_flags |= M_VLANTAG;
2533
2534	/*
2535	 * Remove the 802.1q header by copying the Ethernet
2536	 * addresses over it and adjusting the beginning of
2537	 * the data in the mbuf.  The encapsulated Ethernet
2538	 * type field is already in place.
2539	 */
2540	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2541	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2542	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2543}
2544
2545
2546static inline void
2547mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2548{
2549	mxge_softc_t *sc;
2550	struct ifnet *ifp;
2551	struct mbuf *m;
2552	struct ether_header *eh;
2553	mxge_rx_ring_t *rx;
2554	bus_dmamap_t old_map;
2555	int idx;
2556	uint16_t tcpudp_csum;
2557
2558	sc = ss->sc;
2559	ifp = sc->ifp;
2560	rx = &ss->rx_big;
2561	idx = rx->cnt & rx->mask;
2562	rx->cnt += rx->nbufs;
2563	/* save a pointer to the received mbuf */
2564	m = rx->info[idx].m;
2565	/* try to replace the received mbuf */
2566	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2567		/* drop the frame -- the old mbuf is re-cycled */
2568		ifp->if_ierrors++;
2569		return;
2570	}
2571
2572	/* unmap the received buffer */
2573	old_map = rx->info[idx].map;
2574	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2575	bus_dmamap_unload(rx->dmat, old_map);
2576
2577	/* swap the bus_dmamap_t's */
2578	rx->info[idx].map = rx->extra_map;
2579	rx->extra_map = old_map;
2580
2581	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2582	 * aligned */
2583	m->m_data += MXGEFW_PAD;
2584
2585	m->m_pkthdr.rcvif = ifp;
2586	m->m_len = m->m_pkthdr.len = len;
2587	ss->ipackets++;
2588	eh = mtod(m, struct ether_header *);
2589	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2590		mxge_vlan_tag_remove(m, &csum);
2591	}
2592	/* if the checksum is valid, mark it in the mbuf header */
2593	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2594		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2595			return;
2596		/* otherwise, it was a UDP frame, or a TCP frame which
2597		   we could not do LRO on.  Tell the stack that the
2598		   checksum is good */
2599		m->m_pkthdr.csum_data = 0xffff;
2600		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2601	}
2602	/* flowid only valid if RSS hashing is enabled */
2603	if (sc->num_slices > 1) {
2604		m->m_pkthdr.flowid = (ss - sc->ss);
2605		m->m_flags |= M_FLOWID;
2606	}
2607	/* pass the frame up the stack */
2608	(*ifp->if_input)(ifp, m);
2609}
2610
2611static inline void
2612mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2613{
2614	mxge_softc_t *sc;
2615	struct ifnet *ifp;
2616	struct ether_header *eh;
2617	struct mbuf *m;
2618	mxge_rx_ring_t *rx;
2619	bus_dmamap_t old_map;
2620	int idx;
2621	uint16_t tcpudp_csum;
2622
2623	sc = ss->sc;
2624	ifp = sc->ifp;
2625	rx = &ss->rx_small;
2626	idx = rx->cnt & rx->mask;
2627	rx->cnt++;
2628	/* save a pointer to the received mbuf */
2629	m = rx->info[idx].m;
2630	/* try to replace the received mbuf */
2631	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2632		/* drop the frame -- the old mbuf is re-cycled */
2633		ifp->if_ierrors++;
2634		return;
2635	}
2636
2637	/* unmap the received buffer */
2638	old_map = rx->info[idx].map;
2639	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2640	bus_dmamap_unload(rx->dmat, old_map);
2641
2642	/* swap the bus_dmamap_t's */
2643	rx->info[idx].map = rx->extra_map;
2644	rx->extra_map = old_map;
2645
2646	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2647	 * aligned */
2648	m->m_data += MXGEFW_PAD;
2649
2650	m->m_pkthdr.rcvif = ifp;
2651	m->m_len = m->m_pkthdr.len = len;
2652	ss->ipackets++;
2653	eh = mtod(m, struct ether_header *);
2654	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2655		mxge_vlan_tag_remove(m, &csum);
2656	}
2657	/* if the checksum is valid, mark it in the mbuf header */
2658	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2659		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2660			return;
2661		/* otherwise, it was a UDP frame, or a TCP frame which
2662		   we could not do LRO on.  Tell the stack that the
2663		   checksum is good */
2664		m->m_pkthdr.csum_data = 0xffff;
2665		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2666	}
2667	/* flowid only valid if RSS hashing is enabled */
2668	if (sc->num_slices > 1) {
2669		m->m_pkthdr.flowid = (ss - sc->ss);
2670		m->m_flags |= M_FLOWID;
2671	}
2672	/* pass the frame up the stack */
2673	(*ifp->if_input)(ifp, m);
2674}
2675
2676static inline void
2677mxge_clean_rx_done(struct mxge_slice_state *ss)
2678{
2679	mxge_rx_done_t *rx_done = &ss->rx_done;
2680	int limit = 0;
2681	uint16_t length;
2682	uint16_t checksum;
2683
2684
2685	while (rx_done->entry[rx_done->idx].length != 0) {
2686		length = ntohs(rx_done->entry[rx_done->idx].length);
2687		rx_done->entry[rx_done->idx].length = 0;
2688		checksum = rx_done->entry[rx_done->idx].checksum;
2689		if (length <= (MHLEN - MXGEFW_PAD))
2690			mxge_rx_done_small(ss, length, checksum);
2691		else
2692			mxge_rx_done_big(ss, length, checksum);
2693		rx_done->cnt++;
2694		rx_done->idx = rx_done->cnt & rx_done->mask;
2695
2696		/* limit potential for livelock */
2697		if (__predict_false(++limit > rx_done->mask / 2))
2698			break;
2699	}
2700#ifdef INET
2701	while (!SLIST_EMPTY(&ss->lro_active)) {
2702		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2703		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2704		mxge_lro_flush(ss, lro);
2705	}
2706#endif
2707}
2708
2709
2710static inline void
2711mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2712{
2713	struct ifnet *ifp;
2714	mxge_tx_ring_t *tx;
2715	struct mbuf *m;
2716	bus_dmamap_t map;
2717	int idx;
2718	int *flags;
2719
2720	tx = &ss->tx;
2721	ifp = ss->sc->ifp;
2722	while (tx->pkt_done != mcp_idx) {
2723		idx = tx->done & tx->mask;
2724		tx->done++;
2725		m = tx->info[idx].m;
2726		/* mbuf and DMA map only attached to the first
2727		   segment per-mbuf */
2728		if (m != NULL) {
2729			ss->obytes += m->m_pkthdr.len;
2730			if (m->m_flags & M_MCAST)
2731				ss->omcasts++;
2732			ss->opackets++;
2733			tx->info[idx].m = NULL;
2734			map = tx->info[idx].map;
2735			bus_dmamap_unload(tx->dmat, map);
2736			m_freem(m);
2737		}
2738		if (tx->info[idx].flag) {
2739			tx->info[idx].flag = 0;
2740			tx->pkt_done++;
2741		}
2742	}
2743
2744	/* If we have space, clear IFF_OACTIVE to tell the stack that
2745           its OK to send packets */
2746#ifdef IFNET_BUF_RING
2747	flags = &ss->if_drv_flags;
2748#else
2749	flags = &ifp->if_drv_flags;
2750#endif
2751	mtx_lock(&ss->tx.mtx);
2752	if ((*flags) & IFF_DRV_OACTIVE &&
2753	    tx->req - tx->done < (tx->mask + 1)/4) {
2754		*(flags) &= ~IFF_DRV_OACTIVE;
2755		ss->tx.wake++;
2756		mxge_start_locked(ss);
2757	}
2758#ifdef IFNET_BUF_RING
2759	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2760		/* let the NIC stop polling this queue, since there
2761		 * are no more transmits pending */
2762		if (tx->req == tx->done) {
2763			*tx->send_stop = 1;
2764			tx->queue_active = 0;
2765			tx->deactivate++;
2766			wmb();
2767		}
2768	}
2769#endif
2770	mtx_unlock(&ss->tx.mtx);
2771
2772}
2773
2774static struct mxge_media_type mxge_xfp_media_types[] =
2775{
2776	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2777	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2778	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2779	{0,		(1 << 5),	"10GBASE-ER"},
2780	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2781	{0,		(1 << 3),	"10GBASE-SW"},
2782	{0,		(1 << 2),	"10GBASE-LW"},
2783	{0,		(1 << 1),	"10GBASE-EW"},
2784	{0,		(1 << 0),	"Reserved"}
2785};
2786static struct mxge_media_type mxge_sfp_media_types[] =
2787{
2788	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2789	{0,		(1 << 7),	"Reserved"},
2790	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2791	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2792	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2793	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2794};
2795
2796static void
2797mxge_media_set(mxge_softc_t *sc, int media_type)
2798{
2799
2800
2801	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2802		    0, NULL);
2803	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2804	sc->current_media = media_type;
2805	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2806}
2807
2808static void
2809mxge_media_init(mxge_softc_t *sc)
2810{
2811	char *ptr;
2812	int i;
2813
2814	ifmedia_removeall(&sc->media);
2815	mxge_media_set(sc, IFM_AUTO);
2816
2817	/*
2818	 * parse the product code to deterimine the interface type
2819	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2820	 * after the 3rd dash in the driver's cached copy of the
2821	 * EEPROM's product code string.
2822	 */
2823	ptr = sc->product_code_string;
2824	if (ptr == NULL) {
2825		device_printf(sc->dev, "Missing product code\n");
2826		return;
2827	}
2828
2829	for (i = 0; i < 3; i++, ptr++) {
2830		ptr = strchr(ptr, '-');
2831		if (ptr == NULL) {
2832			device_printf(sc->dev,
2833				      "only %d dashes in PC?!?\n", i);
2834			return;
2835		}
2836	}
2837	if (*ptr == 'C' || *(ptr +1) == 'C') {
2838		/* -C is CX4 */
2839		sc->connector = MXGE_CX4;
2840		mxge_media_set(sc, IFM_10G_CX4);
2841	} else if (*ptr == 'Q') {
2842		/* -Q is Quad Ribbon Fiber */
2843		sc->connector = MXGE_QRF;
2844		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2845		/* FreeBSD has no media type for Quad ribbon fiber */
2846	} else if (*ptr == 'R') {
2847		/* -R is XFP */
2848		sc->connector = MXGE_XFP;
2849	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2850		/* -S or -2S is SFP+ */
2851		sc->connector = MXGE_SFP;
2852	} else {
2853		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2854	}
2855}
2856
2857/*
2858 * Determine the media type for a NIC.  Some XFPs will identify
2859 * themselves only when their link is up, so this is initiated via a
2860 * link up interrupt.  However, this can potentially take up to
2861 * several milliseconds, so it is run via the watchdog routine, rather
2862 * than in the interrupt handler itself.
2863 */
2864static void
2865mxge_media_probe(mxge_softc_t *sc)
2866{
2867	mxge_cmd_t cmd;
2868	char *cage_type;
2869
2870	struct mxge_media_type *mxge_media_types = NULL;
2871	int i, err, ms, mxge_media_type_entries;
2872	uint32_t byte;
2873
2874	sc->need_media_probe = 0;
2875
2876	if (sc->connector == MXGE_XFP) {
2877		/* -R is XFP */
2878		mxge_media_types = mxge_xfp_media_types;
2879		mxge_media_type_entries =
2880			sizeof (mxge_xfp_media_types) /
2881			sizeof (mxge_xfp_media_types[0]);
2882		byte = MXGE_XFP_COMPLIANCE_BYTE;
2883		cage_type = "XFP";
2884	} else 	if (sc->connector == MXGE_SFP) {
2885		/* -S or -2S is SFP+ */
2886		mxge_media_types = mxge_sfp_media_types;
2887		mxge_media_type_entries =
2888			sizeof (mxge_sfp_media_types) /
2889			sizeof (mxge_sfp_media_types[0]);
2890		cage_type = "SFP+";
2891		byte = 3;
2892	} else {
2893		/* nothing to do; media type cannot change */
2894		return;
2895	}
2896
2897	/*
2898	 * At this point we know the NIC has an XFP cage, so now we
2899	 * try to determine what is in the cage by using the
2900	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2901	 * register.  We read just one byte, which may take over
2902	 * a millisecond
2903	 */
2904
2905	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2906	cmd.data1 = byte;
2907	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2908	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2909		device_printf(sc->dev, "failed to read XFP\n");
2910	}
2911	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2912		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2913	}
2914	if (err != MXGEFW_CMD_OK) {
2915		return;
2916	}
2917
2918	/* now we wait for the data to be cached */
2919	cmd.data0 = byte;
2920	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2921	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2922		DELAY(1000);
2923		cmd.data0 = byte;
2924		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2925	}
2926	if (err != MXGEFW_CMD_OK) {
2927		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2928			      cage_type, err, ms);
2929		return;
2930	}
2931
2932	if (cmd.data0 == mxge_media_types[0].bitmask) {
2933		if (mxge_verbose)
2934			device_printf(sc->dev, "%s:%s\n", cage_type,
2935				      mxge_media_types[0].name);
2936		if (sc->current_media != mxge_media_types[0].flag) {
2937			mxge_media_init(sc);
2938			mxge_media_set(sc, mxge_media_types[0].flag);
2939		}
2940		return;
2941	}
2942	for (i = 1; i < mxge_media_type_entries; i++) {
2943		if (cmd.data0 & mxge_media_types[i].bitmask) {
2944			if (mxge_verbose)
2945				device_printf(sc->dev, "%s:%s\n",
2946					      cage_type,
2947					      mxge_media_types[i].name);
2948
2949			if (sc->current_media != mxge_media_types[i].flag) {
2950				mxge_media_init(sc);
2951				mxge_media_set(sc, mxge_media_types[i].flag);
2952			}
2953			return;
2954		}
2955	}
2956	if (mxge_verbose)
2957		device_printf(sc->dev, "%s media 0x%x unknown\n",
2958			      cage_type, cmd.data0);
2959
2960	return;
2961}
2962
2963static void
2964mxge_intr(void *arg)
2965{
2966	struct mxge_slice_state *ss = arg;
2967	mxge_softc_t *sc = ss->sc;
2968	mcp_irq_data_t *stats = ss->fw_stats;
2969	mxge_tx_ring_t *tx = &ss->tx;
2970	mxge_rx_done_t *rx_done = &ss->rx_done;
2971	uint32_t send_done_count;
2972	uint8_t valid;
2973
2974
2975#ifndef IFNET_BUF_RING
2976	/* an interrupt on a non-zero slice is implicitly valid
2977	   since MSI-X irqs are not shared */
2978	if (ss != sc->ss) {
2979		mxge_clean_rx_done(ss);
2980		*ss->irq_claim = be32toh(3);
2981		return;
2982	}
2983#endif
2984
2985	/* make sure the DMA has finished */
2986	if (!stats->valid) {
2987		return;
2988	}
2989	valid = stats->valid;
2990
2991	if (sc->legacy_irq) {
2992		/* lower legacy IRQ  */
2993		*sc->irq_deassert = 0;
2994		if (!mxge_deassert_wait)
2995			/* don't wait for conf. that irq is low */
2996			stats->valid = 0;
2997	} else {
2998		stats->valid = 0;
2999	}
3000
3001	/* loop while waiting for legacy irq deassertion */
3002	do {
3003		/* check for transmit completes and receives */
3004		send_done_count = be32toh(stats->send_done_count);
3005		while ((send_done_count != tx->pkt_done) ||
3006		       (rx_done->entry[rx_done->idx].length != 0)) {
3007			if (send_done_count != tx->pkt_done)
3008				mxge_tx_done(ss, (int)send_done_count);
3009			mxge_clean_rx_done(ss);
3010			send_done_count = be32toh(stats->send_done_count);
3011		}
3012		if (sc->legacy_irq && mxge_deassert_wait)
3013			wmb();
3014	} while (*((volatile uint8_t *) &stats->valid));
3015
3016	/* fw link & error stats meaningful only on the first slice */
3017	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3018		if (sc->link_state != stats->link_up) {
3019			sc->link_state = stats->link_up;
3020			if (sc->link_state) {
3021				if_link_state_change(sc->ifp, LINK_STATE_UP);
3022				 sc->ifp->if_baudrate = IF_Gbps(10UL);
3023				if (mxge_verbose)
3024					device_printf(sc->dev, "link up\n");
3025			} else {
3026				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3027				sc->ifp->if_baudrate = 0;
3028				if (mxge_verbose)
3029					device_printf(sc->dev, "link down\n");
3030			}
3031			sc->need_media_probe = 1;
3032		}
3033		if (sc->rdma_tags_available !=
3034		    be32toh(stats->rdma_tags_available)) {
3035			sc->rdma_tags_available =
3036				be32toh(stats->rdma_tags_available);
3037			device_printf(sc->dev, "RDMA timed out! %d tags "
3038				      "left\n", sc->rdma_tags_available);
3039		}
3040
3041		if (stats->link_down) {
3042			sc->down_cnt += stats->link_down;
3043			sc->link_state = 0;
3044			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3045		}
3046	}
3047
3048	/* check to see if we have rx token to pass back */
3049	if (valid & 0x1)
3050	    *ss->irq_claim = be32toh(3);
3051	*(ss->irq_claim + 1) = be32toh(3);
3052}
3053
3054static void
3055mxge_init(void *arg)
3056{
3057	mxge_softc_t *sc = arg;
3058	struct ifnet *ifp = sc->ifp;
3059
3060
3061	mtx_lock(&sc->driver_mtx);
3062	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3063		(void) mxge_open(sc);
3064	mtx_unlock(&sc->driver_mtx);
3065}
3066
3067
3068
3069static void
3070mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3071{
3072	struct lro_entry *lro_entry;
3073	int i;
3074
3075	while (!SLIST_EMPTY(&ss->lro_free)) {
3076		lro_entry = SLIST_FIRST(&ss->lro_free);
3077		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3078		free(lro_entry, M_DEVBUF);
3079	}
3080
3081	for (i = 0; i <= ss->rx_big.mask; i++) {
3082		if (ss->rx_big.info[i].m == NULL)
3083			continue;
3084		bus_dmamap_unload(ss->rx_big.dmat,
3085				  ss->rx_big.info[i].map);
3086		m_freem(ss->rx_big.info[i].m);
3087		ss->rx_big.info[i].m = NULL;
3088	}
3089
3090	for (i = 0; i <= ss->rx_small.mask; i++) {
3091		if (ss->rx_small.info[i].m == NULL)
3092			continue;
3093		bus_dmamap_unload(ss->rx_small.dmat,
3094				  ss->rx_small.info[i].map);
3095		m_freem(ss->rx_small.info[i].m);
3096		ss->rx_small.info[i].m = NULL;
3097	}
3098
3099	/* transmit ring used only on the first slice */
3100	if (ss->tx.info == NULL)
3101		return;
3102
3103	for (i = 0; i <= ss->tx.mask; i++) {
3104		ss->tx.info[i].flag = 0;
3105		if (ss->tx.info[i].m == NULL)
3106			continue;
3107		bus_dmamap_unload(ss->tx.dmat,
3108				  ss->tx.info[i].map);
3109		m_freem(ss->tx.info[i].m);
3110		ss->tx.info[i].m = NULL;
3111	}
3112}
3113
3114static void
3115mxge_free_mbufs(mxge_softc_t *sc)
3116{
3117	int slice;
3118
3119	for (slice = 0; slice < sc->num_slices; slice++)
3120		mxge_free_slice_mbufs(&sc->ss[slice]);
3121}
3122
3123static void
3124mxge_free_slice_rings(struct mxge_slice_state *ss)
3125{
3126	int i;
3127
3128
3129	if (ss->rx_done.entry != NULL)
3130		mxge_dma_free(&ss->rx_done.dma);
3131	ss->rx_done.entry = NULL;
3132
3133	if (ss->tx.req_bytes != NULL)
3134		free(ss->tx.req_bytes, M_DEVBUF);
3135	ss->tx.req_bytes = NULL;
3136
3137	if (ss->tx.seg_list != NULL)
3138		free(ss->tx.seg_list, M_DEVBUF);
3139	ss->tx.seg_list = NULL;
3140
3141	if (ss->rx_small.shadow != NULL)
3142		free(ss->rx_small.shadow, M_DEVBUF);
3143	ss->rx_small.shadow = NULL;
3144
3145	if (ss->rx_big.shadow != NULL)
3146		free(ss->rx_big.shadow, M_DEVBUF);
3147	ss->rx_big.shadow = NULL;
3148
3149	if (ss->tx.info != NULL) {
3150		if (ss->tx.dmat != NULL) {
3151			for (i = 0; i <= ss->tx.mask; i++) {
3152				bus_dmamap_destroy(ss->tx.dmat,
3153						   ss->tx.info[i].map);
3154			}
3155			bus_dma_tag_destroy(ss->tx.dmat);
3156		}
3157		free(ss->tx.info, M_DEVBUF);
3158	}
3159	ss->tx.info = NULL;
3160
3161	if (ss->rx_small.info != NULL) {
3162		if (ss->rx_small.dmat != NULL) {
3163			for (i = 0; i <= ss->rx_small.mask; i++) {
3164				bus_dmamap_destroy(ss->rx_small.dmat,
3165						   ss->rx_small.info[i].map);
3166			}
3167			bus_dmamap_destroy(ss->rx_small.dmat,
3168					   ss->rx_small.extra_map);
3169			bus_dma_tag_destroy(ss->rx_small.dmat);
3170		}
3171		free(ss->rx_small.info, M_DEVBUF);
3172	}
3173	ss->rx_small.info = NULL;
3174
3175	if (ss->rx_big.info != NULL) {
3176		if (ss->rx_big.dmat != NULL) {
3177			for (i = 0; i <= ss->rx_big.mask; i++) {
3178				bus_dmamap_destroy(ss->rx_big.dmat,
3179						   ss->rx_big.info[i].map);
3180			}
3181			bus_dmamap_destroy(ss->rx_big.dmat,
3182					   ss->rx_big.extra_map);
3183			bus_dma_tag_destroy(ss->rx_big.dmat);
3184		}
3185		free(ss->rx_big.info, M_DEVBUF);
3186	}
3187	ss->rx_big.info = NULL;
3188}
3189
3190static void
3191mxge_free_rings(mxge_softc_t *sc)
3192{
3193	int slice;
3194
3195	for (slice = 0; slice < sc->num_slices; slice++)
3196		mxge_free_slice_rings(&sc->ss[slice]);
3197}
3198
3199static int
3200mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3201		       int tx_ring_entries)
3202{
3203	mxge_softc_t *sc = ss->sc;
3204	size_t bytes;
3205	int err, i;
3206
3207	err = ENOMEM;
3208
3209	/* allocate per-slice receive resources */
3210
3211	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3212	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3213
3214	/* allocate the rx shadow rings */
3215	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3216	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3217	if (ss->rx_small.shadow == NULL)
3218		return err;
3219
3220	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3221	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3222	if (ss->rx_big.shadow == NULL)
3223		return err;
3224
3225	/* allocate the rx host info rings */
3226	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3227	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3228	if (ss->rx_small.info == NULL)
3229		return err;
3230
3231	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3232	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3233	if (ss->rx_big.info == NULL)
3234		return err;
3235
3236	/* allocate the rx busdma resources */
3237	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3238				 1,			/* alignment */
3239				 4096,			/* boundary */
3240				 BUS_SPACE_MAXADDR,	/* low */
3241				 BUS_SPACE_MAXADDR,	/* high */
3242				 NULL, NULL,		/* filter */
3243				 MHLEN,			/* maxsize */
3244				 1,			/* num segs */
3245				 MHLEN,			/* maxsegsize */
3246				 BUS_DMA_ALLOCNOW,	/* flags */
3247				 NULL, NULL,		/* lock */
3248				 &ss->rx_small.dmat);	/* tag */
3249	if (err != 0) {
3250		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3251			      err);
3252		return err;
3253	}
3254
3255	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3256				 1,			/* alignment */
3257#if MXGE_VIRT_JUMBOS
3258				 4096,			/* boundary */
3259#else
3260				 0,			/* boundary */
3261#endif
3262				 BUS_SPACE_MAXADDR,	/* low */
3263				 BUS_SPACE_MAXADDR,	/* high */
3264				 NULL, NULL,		/* filter */
3265				 3*4096,		/* maxsize */
3266#if MXGE_VIRT_JUMBOS
3267				 3,			/* num segs */
3268				 4096,			/* maxsegsize*/
3269#else
3270				 1,			/* num segs */
3271				 MJUM9BYTES,		/* maxsegsize*/
3272#endif
3273				 BUS_DMA_ALLOCNOW,	/* flags */
3274				 NULL, NULL,		/* lock */
3275				 &ss->rx_big.dmat);	/* tag */
3276	if (err != 0) {
3277		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3278			      err);
3279		return err;
3280	}
3281	for (i = 0; i <= ss->rx_small.mask; i++) {
3282		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3283					&ss->rx_small.info[i].map);
3284		if (err != 0) {
3285			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3286				      err);
3287			return err;
3288		}
3289	}
3290	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3291				&ss->rx_small.extra_map);
3292	if (err != 0) {
3293		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3294			      err);
3295		return err;
3296	}
3297
3298	for (i = 0; i <= ss->rx_big.mask; i++) {
3299		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3300					&ss->rx_big.info[i].map);
3301		if (err != 0) {
3302			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3303				      err);
3304			return err;
3305		}
3306	}
3307	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3308				&ss->rx_big.extra_map);
3309	if (err != 0) {
3310		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3311			      err);
3312		return err;
3313	}
3314
3315	/* now allocate TX resouces */
3316
3317#ifndef IFNET_BUF_RING
3318	/* only use a single TX ring for now */
3319	if (ss != ss->sc->ss)
3320		return 0;
3321#endif
3322
3323	ss->tx.mask = tx_ring_entries - 1;
3324	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3325
3326
3327	/* allocate the tx request copy block */
3328	bytes = 8 +
3329		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3330	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3331	if (ss->tx.req_bytes == NULL)
3332		return err;
3333	/* ensure req_list entries are aligned to 8 bytes */
3334	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3335		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3336
3337	/* allocate the tx busdma segment list */
3338	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3339	ss->tx.seg_list = (bus_dma_segment_t *)
3340		malloc(bytes, M_DEVBUF, M_WAITOK);
3341	if (ss->tx.seg_list == NULL)
3342		return err;
3343
3344	/* allocate the tx host info ring */
3345	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3346	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3347	if (ss->tx.info == NULL)
3348		return err;
3349
3350	/* allocate the tx busdma resources */
3351	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3352				 1,			/* alignment */
3353				 sc->tx_boundary,	/* boundary */
3354				 BUS_SPACE_MAXADDR,	/* low */
3355				 BUS_SPACE_MAXADDR,	/* high */
3356				 NULL, NULL,		/* filter */
3357				 65536 + 256,		/* maxsize */
3358				 ss->tx.max_desc - 2,	/* num segs */
3359				 sc->tx_boundary,	/* maxsegsz */
3360				 BUS_DMA_ALLOCNOW,	/* flags */
3361				 NULL, NULL,		/* lock */
3362				 &ss->tx.dmat);		/* tag */
3363
3364	if (err != 0) {
3365		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3366			      err);
3367		return err;
3368	}
3369
3370	/* now use these tags to setup dmamaps for each slot
3371	   in the ring */
3372	for (i = 0; i <= ss->tx.mask; i++) {
3373		err = bus_dmamap_create(ss->tx.dmat, 0,
3374					&ss->tx.info[i].map);
3375		if (err != 0) {
3376			device_printf(sc->dev, "Err %d  tx dmamap\n",
3377				      err);
3378			return err;
3379		}
3380	}
3381	return 0;
3382
3383}
3384
3385static int
3386mxge_alloc_rings(mxge_softc_t *sc)
3387{
3388	mxge_cmd_t cmd;
3389	int tx_ring_size;
3390	int tx_ring_entries, rx_ring_entries;
3391	int err, slice;
3392
3393	/* get ring sizes */
3394	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3395	tx_ring_size = cmd.data0;
3396	if (err != 0) {
3397		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3398		goto abort;
3399	}
3400
3401	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3402	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3403	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3404	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3405	IFQ_SET_READY(&sc->ifp->if_snd);
3406
3407	for (slice = 0; slice < sc->num_slices; slice++) {
3408		err = mxge_alloc_slice_rings(&sc->ss[slice],
3409					     rx_ring_entries,
3410					     tx_ring_entries);
3411		if (err != 0)
3412			goto abort;
3413	}
3414	return 0;
3415
3416abort:
3417	mxge_free_rings(sc);
3418	return err;
3419
3420}
3421
3422
3423static void
3424mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3425{
3426	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3427
3428	if (bufsize < MCLBYTES) {
3429		/* easy, everything fits in a single buffer */
3430		*big_buf_size = MCLBYTES;
3431		*cl_size = MCLBYTES;
3432		*nbufs = 1;
3433		return;
3434	}
3435
3436	if (bufsize < MJUMPAGESIZE) {
3437		/* still easy, everything still fits in a single buffer */
3438		*big_buf_size = MJUMPAGESIZE;
3439		*cl_size = MJUMPAGESIZE;
3440		*nbufs = 1;
3441		return;
3442	}
3443#if MXGE_VIRT_JUMBOS
3444	/* now we need to use virtually contiguous buffers */
3445	*cl_size = MJUM9BYTES;
3446	*big_buf_size = 4096;
3447	*nbufs = mtu / 4096 + 1;
3448	/* needs to be a power of two, so round up */
3449	if (*nbufs == 3)
3450		*nbufs = 4;
3451#else
3452	*cl_size = MJUM9BYTES;
3453	*big_buf_size = MJUM9BYTES;
3454	*nbufs = 1;
3455#endif
3456}
3457
3458static int
3459mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3460{
3461	mxge_softc_t *sc;
3462	mxge_cmd_t cmd;
3463	bus_dmamap_t map;
3464	struct lro_entry *lro_entry;
3465	int err, i, slice;
3466
3467
3468	sc = ss->sc;
3469	slice = ss - sc->ss;
3470
3471	SLIST_INIT(&ss->lro_free);
3472	SLIST_INIT(&ss->lro_active);
3473
3474	for (i = 0; i < sc->lro_cnt; i++) {
3475		lro_entry = (struct lro_entry *)
3476			malloc(sizeof (*lro_entry), M_DEVBUF,
3477			       M_NOWAIT | M_ZERO);
3478		if (lro_entry == NULL) {
3479			sc->lro_cnt = i;
3480			break;
3481		}
3482		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3483	}
3484	/* get the lanai pointers to the send and receive rings */
3485
3486	err = 0;
3487#ifndef IFNET_BUF_RING
3488	/* We currently only send from the first slice */
3489	if (slice == 0) {
3490#endif
3491		cmd.data0 = slice;
3492		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3493		ss->tx.lanai =
3494			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3495		ss->tx.send_go = (volatile uint32_t *)
3496			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3497		ss->tx.send_stop = (volatile uint32_t *)
3498		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3499#ifndef IFNET_BUF_RING
3500	}
3501#endif
3502	cmd.data0 = slice;
3503	err |= mxge_send_cmd(sc,
3504			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3505	ss->rx_small.lanai =
3506		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3507	cmd.data0 = slice;
3508	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3509	ss->rx_big.lanai =
3510		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3511
3512	if (err != 0) {
3513		device_printf(sc->dev,
3514			      "failed to get ring sizes or locations\n");
3515		return EIO;
3516	}
3517
3518	/* stock receive rings */
3519	for (i = 0; i <= ss->rx_small.mask; i++) {
3520		map = ss->rx_small.info[i].map;
3521		err = mxge_get_buf_small(ss, map, i);
3522		if (err) {
3523			device_printf(sc->dev, "alloced %d/%d smalls\n",
3524				      i, ss->rx_small.mask + 1);
3525			return ENOMEM;
3526		}
3527	}
3528	for (i = 0; i <= ss->rx_big.mask; i++) {
3529		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3530		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3531	}
3532	ss->rx_big.nbufs = nbufs;
3533	ss->rx_big.cl_size = cl_size;
3534	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3535		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3536	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3537		map = ss->rx_big.info[i].map;
3538		err = mxge_get_buf_big(ss, map, i);
3539		if (err) {
3540			device_printf(sc->dev, "alloced %d/%d bigs\n",
3541				      i, ss->rx_big.mask + 1);
3542			return ENOMEM;
3543		}
3544	}
3545	return 0;
3546}
3547
3548static int
3549mxge_open(mxge_softc_t *sc)
3550{
3551	mxge_cmd_t cmd;
3552	int err, big_bytes, nbufs, slice, cl_size, i;
3553	bus_addr_t bus;
3554	volatile uint8_t *itable;
3555	struct mxge_slice_state *ss;
3556
3557	/* Copy the MAC address in case it was overridden */
3558	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3559
3560	err = mxge_reset(sc, 1);
3561	if (err != 0) {
3562		device_printf(sc->dev, "failed to reset\n");
3563		return EIO;
3564	}
3565
3566	if (sc->num_slices > 1) {
3567		/* setup the indirection table */
3568		cmd.data0 = sc->num_slices;
3569		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3570				    &cmd);
3571
3572		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3573				     &cmd);
3574		if (err != 0) {
3575			device_printf(sc->dev,
3576				      "failed to setup rss tables\n");
3577			return err;
3578		}
3579
3580		/* just enable an identity mapping */
3581		itable = sc->sram + cmd.data0;
3582		for (i = 0; i < sc->num_slices; i++)
3583			itable[i] = (uint8_t)i;
3584
3585		cmd.data0 = 1;
3586		cmd.data1 = mxge_rss_hash_type;
3587		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3588		if (err != 0) {
3589			device_printf(sc->dev, "failed to enable slices\n");
3590			return err;
3591		}
3592	}
3593
3594
3595	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3596
3597	cmd.data0 = nbufs;
3598	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3599			    &cmd);
3600	/* error is only meaningful if we're trying to set
3601	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3602	if (err && nbufs > 1) {
3603		device_printf(sc->dev,
3604			      "Failed to set alway-use-n to %d\n",
3605			      nbufs);
3606		return EIO;
3607	}
3608	/* Give the firmware the mtu and the big and small buffer
3609	   sizes.  The firmware wants the big buf size to be a power
3610	   of two. Luckily, FreeBSD's clusters are powers of two */
3611	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3612	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3613	cmd.data0 = MHLEN - MXGEFW_PAD;
3614	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3615			     &cmd);
3616	cmd.data0 = big_bytes;
3617	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3618
3619	if (err != 0) {
3620		device_printf(sc->dev, "failed to setup params\n");
3621		goto abort;
3622	}
3623
3624	/* Now give him the pointer to the stats block */
3625	for (slice = 0;
3626#ifdef IFNET_BUF_RING
3627	     slice < sc->num_slices;
3628#else
3629	     slice < 1;
3630#endif
3631	     slice++) {
3632		ss = &sc->ss[slice];
3633		cmd.data0 =
3634			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3635		cmd.data1 =
3636			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3637		cmd.data2 = sizeof(struct mcp_irq_data);
3638		cmd.data2 |= (slice << 16);
3639		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3640	}
3641
3642	if (err != 0) {
3643		bus = sc->ss->fw_stats_dma.bus_addr;
3644		bus += offsetof(struct mcp_irq_data, send_done_count);
3645		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3646		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3647		err = mxge_send_cmd(sc,
3648				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3649				    &cmd);
3650		/* Firmware cannot support multicast without STATS_DMA_V2 */
3651		sc->fw_multicast_support = 0;
3652	} else {
3653		sc->fw_multicast_support = 1;
3654	}
3655
3656	if (err != 0) {
3657		device_printf(sc->dev, "failed to setup params\n");
3658		goto abort;
3659	}
3660
3661	for (slice = 0; slice < sc->num_slices; slice++) {
3662		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3663		if (err != 0) {
3664			device_printf(sc->dev, "couldn't open slice %d\n",
3665				      slice);
3666			goto abort;
3667		}
3668	}
3669
3670	/* Finally, start the firmware running */
3671	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3672	if (err) {
3673		device_printf(sc->dev, "Couldn't bring up link\n");
3674		goto abort;
3675	}
3676#ifdef IFNET_BUF_RING
3677	for (slice = 0; slice < sc->num_slices; slice++) {
3678		ss = &sc->ss[slice];
3679		ss->if_drv_flags |= IFF_DRV_RUNNING;
3680		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3681	}
3682#endif
3683	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3684	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3685
3686	return 0;
3687
3688
3689abort:
3690	mxge_free_mbufs(sc);
3691
3692	return err;
3693}
3694
3695static int
3696mxge_close(mxge_softc_t *sc, int down)
3697{
3698	mxge_cmd_t cmd;
3699	int err, old_down_cnt;
3700#ifdef IFNET_BUF_RING
3701	struct mxge_slice_state *ss;
3702	int slice;
3703#endif
3704
3705#ifdef IFNET_BUF_RING
3706	for (slice = 0; slice < sc->num_slices; slice++) {
3707		ss = &sc->ss[slice];
3708		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3709	}
3710#endif
3711	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3712	if (!down) {
3713		old_down_cnt = sc->down_cnt;
3714		wmb();
3715		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3716		if (err) {
3717			device_printf(sc->dev,
3718				      "Couldn't bring down link\n");
3719		}
3720		if (old_down_cnt == sc->down_cnt) {
3721			/* wait for down irq */
3722			DELAY(10 * sc->intr_coal_delay);
3723		}
3724		wmb();
3725		if (old_down_cnt == sc->down_cnt) {
3726			device_printf(sc->dev, "never got down irq\n");
3727		}
3728	}
3729	mxge_free_mbufs(sc);
3730
3731	return 0;
3732}
3733
3734static void
3735mxge_setup_cfg_space(mxge_softc_t *sc)
3736{
3737	device_t dev = sc->dev;
3738	int reg;
3739	uint16_t cmd, lnk, pectl;
3740
3741	/* find the PCIe link width and set max read request to 4KB*/
3742	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3743		lnk = pci_read_config(dev, reg + 0x12, 2);
3744		sc->link_width = (lnk >> 4) & 0x3f;
3745
3746		if (sc->pectl == 0) {
3747			pectl = pci_read_config(dev, reg + 0x8, 2);
3748			pectl = (pectl & ~0x7000) | (5 << 12);
3749			pci_write_config(dev, reg + 0x8, pectl, 2);
3750			sc->pectl = pectl;
3751		} else {
3752			/* restore saved pectl after watchdog reset */
3753			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3754		}
3755	}
3756
3757	/* Enable DMA and Memory space access */
3758	pci_enable_busmaster(dev);
3759	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3760	cmd |= PCIM_CMD_MEMEN;
3761	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3762}
3763
3764static uint32_t
3765mxge_read_reboot(mxge_softc_t *sc)
3766{
3767	device_t dev = sc->dev;
3768	uint32_t vs;
3769
3770	/* find the vendor specific offset */
3771	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3772		device_printf(sc->dev,
3773			      "could not find vendor specific offset\n");
3774		return (uint32_t)-1;
3775	}
3776	/* enable read32 mode */
3777	pci_write_config(dev, vs + 0x10, 0x3, 1);
3778	/* tell NIC which register to read */
3779	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3780	return (pci_read_config(dev, vs + 0x14, 4));
3781}
3782
3783static void
3784mxge_watchdog_reset(mxge_softc_t *sc)
3785{
3786	struct pci_devinfo *dinfo;
3787	struct mxge_slice_state *ss;
3788	int err, running, s, num_tx_slices = 1;
3789	uint32_t reboot;
3790	uint16_t cmd;
3791
3792	err = ENXIO;
3793
3794	device_printf(sc->dev, "Watchdog reset!\n");
3795
3796	/*
3797	 * check to see if the NIC rebooted.  If it did, then all of
3798	 * PCI config space has been reset, and things like the
3799	 * busmaster bit will be zero.  If this is the case, then we
3800	 * must restore PCI config space before the NIC can be used
3801	 * again
3802	 */
3803	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3804	if (cmd == 0xffff) {
3805		/*
3806		 * maybe the watchdog caught the NIC rebooting; wait
3807		 * up to 100ms for it to finish.  If it does not come
3808		 * back, then give up
3809		 */
3810		DELAY(1000*100);
3811		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3812		if (cmd == 0xffff) {
3813			device_printf(sc->dev, "NIC disappeared!\n");
3814		}
3815	}
3816	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3817		/* print the reboot status */
3818		reboot = mxge_read_reboot(sc);
3819		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3820			      reboot);
3821		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3822		if (running) {
3823
3824			/*
3825			 * quiesce NIC so that TX routines will not try to
3826			 * xmit after restoration of BAR
3827			 */
3828
3829			/* Mark the link as down */
3830			if (sc->link_state) {
3831				sc->link_state = 0;
3832				if_link_state_change(sc->ifp,
3833						     LINK_STATE_DOWN);
3834			}
3835#ifdef IFNET_BUF_RING
3836			num_tx_slices = sc->num_slices;
3837#endif
3838			/* grab all TX locks to ensure no tx  */
3839			for (s = 0; s < num_tx_slices; s++) {
3840				ss = &sc->ss[s];
3841				mtx_lock(&ss->tx.mtx);
3842			}
3843			mxge_close(sc, 1);
3844		}
3845		/* restore PCI configuration space */
3846		dinfo = device_get_ivars(sc->dev);
3847		pci_cfg_restore(sc->dev, dinfo);
3848
3849		/* and redo any changes we made to our config space */
3850		mxge_setup_cfg_space(sc);
3851
3852		/* reload f/w */
3853		err = mxge_load_firmware(sc, 0);
3854		if (err) {
3855			device_printf(sc->dev,
3856				      "Unable to re-load f/w\n");
3857		}
3858		if (running) {
3859			if (!err)
3860				err = mxge_open(sc);
3861			/* release all TX locks */
3862			for (s = 0; s < num_tx_slices; s++) {
3863				ss = &sc->ss[s];
3864#ifdef IFNET_BUF_RING
3865				mxge_start_locked(ss);
3866#endif
3867				mtx_unlock(&ss->tx.mtx);
3868			}
3869		}
3870		sc->watchdog_resets++;
3871	} else {
3872		device_printf(sc->dev,
3873			      "NIC did not reboot, not resetting\n");
3874		err = 0;
3875	}
3876	if (err) {
3877		device_printf(sc->dev, "watchdog reset failed\n");
3878	} else {
3879		if (sc->dying == 2)
3880			sc->dying = 0;
3881		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3882	}
3883}
3884
3885static void
3886mxge_watchdog_task(void *arg, int pending)
3887{
3888	mxge_softc_t *sc = arg;
3889
3890
3891	mtx_lock(&sc->driver_mtx);
3892	mxge_watchdog_reset(sc);
3893	mtx_unlock(&sc->driver_mtx);
3894}
3895
3896static void
3897mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3898{
3899	tx = &sc->ss[slice].tx;
3900	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3901	device_printf(sc->dev,
3902		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3903		      tx->req, tx->done, tx->queue_active);
3904	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3905			      tx->activate, tx->deactivate);
3906	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3907		      tx->pkt_done,
3908		      be32toh(sc->ss->fw_stats->send_done_count));
3909}
3910
3911static int
3912mxge_watchdog(mxge_softc_t *sc)
3913{
3914	mxge_tx_ring_t *tx;
3915	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3916	int i, err = 0;
3917
3918	/* see if we have outstanding transmits, which
3919	   have been pending for more than mxge_ticks */
3920	for (i = 0;
3921#ifdef IFNET_BUF_RING
3922	     (i < sc->num_slices) && (err == 0);
3923#else
3924	     (i < 1) && (err == 0);
3925#endif
3926	     i++) {
3927		tx = &sc->ss[i].tx;
3928		if (tx->req != tx->done &&
3929		    tx->watchdog_req != tx->watchdog_done &&
3930		    tx->done == tx->watchdog_done) {
3931			/* check for pause blocking before resetting */
3932			if (tx->watchdog_rx_pause == rx_pause) {
3933				mxge_warn_stuck(sc, tx, i);
3934				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3935				return (ENXIO);
3936			}
3937			else
3938				device_printf(sc->dev, "Flow control blocking "
3939					      "xmits, check link partner\n");
3940		}
3941
3942		tx->watchdog_req = tx->req;
3943		tx->watchdog_done = tx->done;
3944		tx->watchdog_rx_pause = rx_pause;
3945	}
3946
3947	if (sc->need_media_probe)
3948		mxge_media_probe(sc);
3949	return (err);
3950}
3951
3952static u_long
3953mxge_update_stats(mxge_softc_t *sc)
3954{
3955	struct mxge_slice_state *ss;
3956	u_long pkts = 0;
3957	u_long ipackets = 0;
3958	u_long opackets = 0;
3959#ifdef IFNET_BUF_RING
3960	u_long obytes = 0;
3961	u_long omcasts = 0;
3962	u_long odrops = 0;
3963#endif
3964	u_long oerrors = 0;
3965	int slice;
3966
3967	for (slice = 0; slice < sc->num_slices; slice++) {
3968		ss = &sc->ss[slice];
3969		ipackets += ss->ipackets;
3970		opackets += ss->opackets;
3971#ifdef IFNET_BUF_RING
3972		obytes += ss->obytes;
3973		omcasts += ss->omcasts;
3974		odrops += ss->tx.br->br_drops;
3975#endif
3976		oerrors += ss->oerrors;
3977	}
3978	pkts = (ipackets - sc->ifp->if_ipackets);
3979	pkts += (opackets - sc->ifp->if_opackets);
3980	sc->ifp->if_ipackets = ipackets;
3981	sc->ifp->if_opackets = opackets;
3982#ifdef IFNET_BUF_RING
3983	sc->ifp->if_obytes = obytes;
3984	sc->ifp->if_omcasts = omcasts;
3985	sc->ifp->if_snd.ifq_drops = odrops;
3986#endif
3987	sc->ifp->if_oerrors = oerrors;
3988	return pkts;
3989}
3990
3991static void
3992mxge_tick(void *arg)
3993{
3994	mxge_softc_t *sc = arg;
3995	u_long pkts = 0;
3996	int err = 0;
3997	int running, ticks;
3998	uint16_t cmd;
3999
4000	ticks = mxge_ticks;
4001	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4002	if (running) {
4003		/* aggregate stats from different slices */
4004		pkts = mxge_update_stats(sc);
4005		if (!sc->watchdog_countdown) {
4006			err = mxge_watchdog(sc);
4007			sc->watchdog_countdown = 4;
4008		}
4009		sc->watchdog_countdown--;
4010	}
4011	if (pkts == 0) {
4012		/* ensure NIC did not suffer h/w fault while idle */
4013		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4014		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4015			sc->dying = 2;
4016			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4017			err = ENXIO;
4018		}
4019		/* look less often if NIC is idle */
4020		ticks *= 4;
4021	}
4022
4023	if (err == 0)
4024		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4025
4026}
4027
4028static int
4029mxge_media_change(struct ifnet *ifp)
4030{
4031	return EINVAL;
4032}
4033
4034static int
4035mxge_change_mtu(mxge_softc_t *sc, int mtu)
4036{
4037	struct ifnet *ifp = sc->ifp;
4038	int real_mtu, old_mtu;
4039	int err = 0;
4040
4041
4042	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4043	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4044		return EINVAL;
4045	mtx_lock(&sc->driver_mtx);
4046	old_mtu = ifp->if_mtu;
4047	ifp->if_mtu = mtu;
4048	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4049		mxge_close(sc, 0);
4050		err = mxge_open(sc);
4051		if (err != 0) {
4052			ifp->if_mtu = old_mtu;
4053			mxge_close(sc, 0);
4054			(void) mxge_open(sc);
4055		}
4056	}
4057	mtx_unlock(&sc->driver_mtx);
4058	return err;
4059}
4060
4061static void
4062mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4063{
4064	mxge_softc_t *sc = ifp->if_softc;
4065
4066
4067	if (sc == NULL)
4068		return;
4069	ifmr->ifm_status = IFM_AVALID;
4070	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4071	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4072	ifmr->ifm_active |= sc->current_media;
4073}
4074
4075static int
4076mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4077{
4078	mxge_softc_t *sc = ifp->if_softc;
4079	struct ifreq *ifr = (struct ifreq *)data;
4080	int err, mask;
4081
4082	err = 0;
4083	switch (command) {
4084	case SIOCSIFADDR:
4085	case SIOCGIFADDR:
4086		err = ether_ioctl(ifp, command, data);
4087		break;
4088
4089	case SIOCSIFMTU:
4090		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4091		break;
4092
4093	case SIOCSIFFLAGS:
4094		mtx_lock(&sc->driver_mtx);
4095		if (sc->dying) {
4096			mtx_unlock(&sc->driver_mtx);
4097			return EINVAL;
4098		}
4099		if (ifp->if_flags & IFF_UP) {
4100			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4101				err = mxge_open(sc);
4102			} else {
4103				/* take care of promis can allmulti
4104				   flag chages */
4105				mxge_change_promisc(sc,
4106						    ifp->if_flags & IFF_PROMISC);
4107				mxge_set_multicast_list(sc);
4108			}
4109		} else {
4110			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4111				mxge_close(sc, 0);
4112			}
4113		}
4114		mtx_unlock(&sc->driver_mtx);
4115		break;
4116
4117	case SIOCADDMULTI:
4118	case SIOCDELMULTI:
4119		mtx_lock(&sc->driver_mtx);
4120		mxge_set_multicast_list(sc);
4121		mtx_unlock(&sc->driver_mtx);
4122		break;
4123
4124	case SIOCSIFCAP:
4125		mtx_lock(&sc->driver_mtx);
4126		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4127		if (mask & IFCAP_TXCSUM) {
4128			if (IFCAP_TXCSUM & ifp->if_capenable) {
4129				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4130				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4131						      | CSUM_TSO);
4132			} else {
4133				ifp->if_capenable |= IFCAP_TXCSUM;
4134				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4135			}
4136		} else if (mask & IFCAP_RXCSUM) {
4137			if (IFCAP_RXCSUM & ifp->if_capenable) {
4138				ifp->if_capenable &= ~IFCAP_RXCSUM;
4139				sc->csum_flag = 0;
4140			} else {
4141				ifp->if_capenable |= IFCAP_RXCSUM;
4142				sc->csum_flag = 1;
4143			}
4144		}
4145		if (mask & IFCAP_TSO4) {
4146			if (IFCAP_TSO4 & ifp->if_capenable) {
4147				ifp->if_capenable &= ~IFCAP_TSO4;
4148				ifp->if_hwassist &= ~CSUM_TSO;
4149			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4150				ifp->if_capenable |= IFCAP_TSO4;
4151				ifp->if_hwassist |= CSUM_TSO;
4152			} else {
4153				printf("mxge requires tx checksum offload"
4154				       " be enabled to use TSO\n");
4155				err = EINVAL;
4156			}
4157		}
4158		if (mask & IFCAP_LRO) {
4159			if (IFCAP_LRO & ifp->if_capenable)
4160				err = mxge_change_lro_locked(sc, 0);
4161			else
4162				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4163		}
4164		if (mask & IFCAP_VLAN_HWTAGGING)
4165			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4166		if (mask & IFCAP_VLAN_HWTSO)
4167			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4168
4169		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4170		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4171			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4172
4173		mtx_unlock(&sc->driver_mtx);
4174		VLAN_CAPABILITIES(ifp);
4175
4176		break;
4177
4178	case SIOCGIFMEDIA:
4179		mtx_lock(&sc->driver_mtx);
4180		mxge_media_probe(sc);
4181		mtx_unlock(&sc->driver_mtx);
4182		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4183				    &sc->media, command);
4184                break;
4185
4186	default:
4187		err = ENOTTY;
4188        }
4189	return err;
4190}
4191
4192static void
4193mxge_fetch_tunables(mxge_softc_t *sc)
4194{
4195
4196	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4197	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4198			  &mxge_flow_control);
4199	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4200			  &mxge_intr_coal_delay);
4201	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4202			  &mxge_nvidia_ecrc_enable);
4203	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4204			  &mxge_force_firmware);
4205	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4206			  &mxge_deassert_wait);
4207	TUNABLE_INT_FETCH("hw.mxge.verbose",
4208			  &mxge_verbose);
4209	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4210	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4211	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4212	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4213	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4214	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4215	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4216	if (sc->lro_cnt != 0)
4217		mxge_lro_cnt = sc->lro_cnt;
4218
4219	if (bootverbose)
4220		mxge_verbose = 1;
4221	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4222		mxge_intr_coal_delay = 30;
4223	if (mxge_ticks == 0)
4224		mxge_ticks = hz / 2;
4225	sc->pause = mxge_flow_control;
4226	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4227	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4228		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4229	}
4230	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4231	    mxge_initial_mtu < ETHER_MIN_LEN)
4232		mxge_initial_mtu = ETHERMTU_JUMBO;
4233
4234	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4235		mxge_throttle = MXGE_MAX_THROTTLE;
4236	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4237		mxge_throttle = MXGE_MIN_THROTTLE;
4238	sc->throttle = mxge_throttle;
4239}
4240
4241
4242static void
4243mxge_free_slices(mxge_softc_t *sc)
4244{
4245	struct mxge_slice_state *ss;
4246	int i;
4247
4248
4249	if (sc->ss == NULL)
4250		return;
4251
4252	for (i = 0; i < sc->num_slices; i++) {
4253		ss = &sc->ss[i];
4254		if (ss->fw_stats != NULL) {
4255			mxge_dma_free(&ss->fw_stats_dma);
4256			ss->fw_stats = NULL;
4257#ifdef IFNET_BUF_RING
4258			if (ss->tx.br != NULL) {
4259				drbr_free(ss->tx.br, M_DEVBUF);
4260				ss->tx.br = NULL;
4261			}
4262#endif
4263			mtx_destroy(&ss->tx.mtx);
4264		}
4265		if (ss->rx_done.entry != NULL) {
4266			mxge_dma_free(&ss->rx_done.dma);
4267			ss->rx_done.entry = NULL;
4268		}
4269	}
4270	free(sc->ss, M_DEVBUF);
4271	sc->ss = NULL;
4272}
4273
4274static int
4275mxge_alloc_slices(mxge_softc_t *sc)
4276{
4277	mxge_cmd_t cmd;
4278	struct mxge_slice_state *ss;
4279	size_t bytes;
4280	int err, i, max_intr_slots;
4281
4282	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4283	if (err != 0) {
4284		device_printf(sc->dev, "Cannot determine rx ring size\n");
4285		return err;
4286	}
4287	sc->rx_ring_size = cmd.data0;
4288	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4289
4290	bytes = sizeof (*sc->ss) * sc->num_slices;
4291	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4292	if (sc->ss == NULL)
4293		return (ENOMEM);
4294	for (i = 0; i < sc->num_slices; i++) {
4295		ss = &sc->ss[i];
4296
4297		ss->sc = sc;
4298
4299		/* allocate per-slice rx interrupt queues */
4300
4301		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4302		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4303		if (err != 0)
4304			goto abort;
4305		ss->rx_done.entry = ss->rx_done.dma.addr;
4306		bzero(ss->rx_done.entry, bytes);
4307
4308		/*
4309		 * allocate the per-slice firmware stats; stats
4310		 * (including tx) are used used only on the first
4311		 * slice for now
4312		 */
4313#ifndef IFNET_BUF_RING
4314		if (i > 0)
4315			continue;
4316#endif
4317
4318		bytes = sizeof (*ss->fw_stats);
4319		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4320				     sizeof (*ss->fw_stats), 64);
4321		if (err != 0)
4322			goto abort;
4323		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4324		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4325			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4326		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4327#ifdef IFNET_BUF_RING
4328		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4329					   &ss->tx.mtx);
4330#endif
4331	}
4332
4333	return (0);
4334
4335abort:
4336	mxge_free_slices(sc);
4337	return (ENOMEM);
4338}
4339
4340static void
4341mxge_slice_probe(mxge_softc_t *sc)
4342{
4343	mxge_cmd_t cmd;
4344	char *old_fw;
4345	int msix_cnt, status, max_intr_slots;
4346
4347	sc->num_slices = 1;
4348	/*
4349	 *  don't enable multiple slices if they are not enabled,
4350	 *  or if this is not an SMP system
4351	 */
4352
4353	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4354		return;
4355
4356	/* see how many MSI-X interrupts are available */
4357	msix_cnt = pci_msix_count(sc->dev);
4358	if (msix_cnt < 2)
4359		return;
4360
4361	/* now load the slice aware firmware see what it supports */
4362	old_fw = sc->fw_name;
4363	if (old_fw == mxge_fw_aligned)
4364		sc->fw_name = mxge_fw_rss_aligned;
4365	else
4366		sc->fw_name = mxge_fw_rss_unaligned;
4367	status = mxge_load_firmware(sc, 0);
4368	if (status != 0) {
4369		device_printf(sc->dev, "Falling back to a single slice\n");
4370		return;
4371	}
4372
4373	/* try to send a reset command to the card to see if it
4374	   is alive */
4375	memset(&cmd, 0, sizeof (cmd));
4376	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4377	if (status != 0) {
4378		device_printf(sc->dev, "failed reset\n");
4379		goto abort_with_fw;
4380	}
4381
4382	/* get rx ring size */
4383	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4384	if (status != 0) {
4385		device_printf(sc->dev, "Cannot determine rx ring size\n");
4386		goto abort_with_fw;
4387	}
4388	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4389
4390	/* tell it the size of the interrupt queues */
4391	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4392	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4393	if (status != 0) {
4394		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4395		goto abort_with_fw;
4396	}
4397
4398	/* ask the maximum number of slices it supports */
4399	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4400	if (status != 0) {
4401		device_printf(sc->dev,
4402			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4403		goto abort_with_fw;
4404	}
4405	sc->num_slices = cmd.data0;
4406	if (sc->num_slices > msix_cnt)
4407		sc->num_slices = msix_cnt;
4408
4409	if (mxge_max_slices == -1) {
4410		/* cap to number of CPUs in system */
4411		if (sc->num_slices > mp_ncpus)
4412			sc->num_slices = mp_ncpus;
4413	} else {
4414		if (sc->num_slices > mxge_max_slices)
4415			sc->num_slices = mxge_max_slices;
4416	}
4417	/* make sure it is a power of two */
4418	while (sc->num_slices & (sc->num_slices - 1))
4419		sc->num_slices--;
4420
4421	if (mxge_verbose)
4422		device_printf(sc->dev, "using %d slices\n",
4423			      sc->num_slices);
4424
4425	return;
4426
4427abort_with_fw:
4428	sc->fw_name = old_fw;
4429	(void) mxge_load_firmware(sc, 0);
4430}
4431
4432static int
4433mxge_add_msix_irqs(mxge_softc_t *sc)
4434{
4435	size_t bytes;
4436	int count, err, i, rid;
4437
4438	rid = PCIR_BAR(2);
4439	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4440						    &rid, RF_ACTIVE);
4441
4442	if (sc->msix_table_res == NULL) {
4443		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4444		return ENXIO;
4445	}
4446
4447	count = sc->num_slices;
4448	err = pci_alloc_msix(sc->dev, &count);
4449	if (err != 0) {
4450		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4451			      "err = %d \n", sc->num_slices, err);
4452		goto abort_with_msix_table;
4453	}
4454	if (count < sc->num_slices) {
4455		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4456			      count, sc->num_slices);
4457		device_printf(sc->dev,
4458			      "Try setting hw.mxge.max_slices to %d\n",
4459			      count);
4460		err = ENOSPC;
4461		goto abort_with_msix;
4462	}
4463	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4464	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4465	if (sc->msix_irq_res == NULL) {
4466		err = ENOMEM;
4467		goto abort_with_msix;
4468	}
4469
4470	for (i = 0; i < sc->num_slices; i++) {
4471		rid = i + 1;
4472		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4473							  SYS_RES_IRQ,
4474							  &rid, RF_ACTIVE);
4475		if (sc->msix_irq_res[i] == NULL) {
4476			device_printf(sc->dev, "couldn't allocate IRQ res"
4477				      " for message %d\n", i);
4478			err = ENXIO;
4479			goto abort_with_res;
4480		}
4481	}
4482
4483	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4484	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4485
4486	for (i = 0; i < sc->num_slices; i++) {
4487		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4488				     INTR_TYPE_NET | INTR_MPSAFE,
4489#if __FreeBSD_version > 700030
4490				     NULL,
4491#endif
4492				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4493		if (err != 0) {
4494			device_printf(sc->dev, "couldn't setup intr for "
4495				      "message %d\n", i);
4496			goto abort_with_intr;
4497		}
4498		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4499				  sc->msix_ih[i], "s%d", i);
4500	}
4501
4502	if (mxge_verbose) {
4503		device_printf(sc->dev, "using %d msix IRQs:",
4504			      sc->num_slices);
4505		for (i = 0; i < sc->num_slices; i++)
4506			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4507		printf("\n");
4508	}
4509	return (0);
4510
4511abort_with_intr:
4512	for (i = 0; i < sc->num_slices; i++) {
4513		if (sc->msix_ih[i] != NULL) {
4514			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4515					  sc->msix_ih[i]);
4516			sc->msix_ih[i] = NULL;
4517		}
4518	}
4519	free(sc->msix_ih, M_DEVBUF);
4520
4521
4522abort_with_res:
4523	for (i = 0; i < sc->num_slices; i++) {
4524		rid = i + 1;
4525		if (sc->msix_irq_res[i] != NULL)
4526			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4527					     sc->msix_irq_res[i]);
4528		sc->msix_irq_res[i] = NULL;
4529	}
4530	free(sc->msix_irq_res, M_DEVBUF);
4531
4532
4533abort_with_msix:
4534	pci_release_msi(sc->dev);
4535
4536abort_with_msix_table:
4537	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4538			     sc->msix_table_res);
4539
4540	return err;
4541}
4542
4543static int
4544mxge_add_single_irq(mxge_softc_t *sc)
4545{
4546	int count, err, rid;
4547
4548	count = pci_msi_count(sc->dev);
4549	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4550		rid = 1;
4551	} else {
4552		rid = 0;
4553		sc->legacy_irq = 1;
4554	}
4555	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4556					 1, RF_SHAREABLE | RF_ACTIVE);
4557	if (sc->irq_res == NULL) {
4558		device_printf(sc->dev, "could not alloc interrupt\n");
4559		return ENXIO;
4560	}
4561	if (mxge_verbose)
4562		device_printf(sc->dev, "using %s irq %ld\n",
4563			      sc->legacy_irq ? "INTx" : "MSI",
4564			      rman_get_start(sc->irq_res));
4565	err = bus_setup_intr(sc->dev, sc->irq_res,
4566			     INTR_TYPE_NET | INTR_MPSAFE,
4567#if __FreeBSD_version > 700030
4568			     NULL,
4569#endif
4570			     mxge_intr, &sc->ss[0], &sc->ih);
4571	if (err != 0) {
4572		bus_release_resource(sc->dev, SYS_RES_IRQ,
4573				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4574		if (!sc->legacy_irq)
4575			pci_release_msi(sc->dev);
4576	}
4577	return err;
4578}
4579
4580static void
4581mxge_rem_msix_irqs(mxge_softc_t *sc)
4582{
4583	int i, rid;
4584
4585	for (i = 0; i < sc->num_slices; i++) {
4586		if (sc->msix_ih[i] != NULL) {
4587			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4588					  sc->msix_ih[i]);
4589			sc->msix_ih[i] = NULL;
4590		}
4591	}
4592	free(sc->msix_ih, M_DEVBUF);
4593
4594	for (i = 0; i < sc->num_slices; i++) {
4595		rid = i + 1;
4596		if (sc->msix_irq_res[i] != NULL)
4597			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4598					     sc->msix_irq_res[i]);
4599		sc->msix_irq_res[i] = NULL;
4600	}
4601	free(sc->msix_irq_res, M_DEVBUF);
4602
4603	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4604			     sc->msix_table_res);
4605
4606	pci_release_msi(sc->dev);
4607	return;
4608}
4609
4610static void
4611mxge_rem_single_irq(mxge_softc_t *sc)
4612{
4613	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4614	bus_release_resource(sc->dev, SYS_RES_IRQ,
4615			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4616	if (!sc->legacy_irq)
4617		pci_release_msi(sc->dev);
4618}
4619
4620static void
4621mxge_rem_irq(mxge_softc_t *sc)
4622{
4623	if (sc->num_slices > 1)
4624		mxge_rem_msix_irqs(sc);
4625	else
4626		mxge_rem_single_irq(sc);
4627}
4628
4629static int
4630mxge_add_irq(mxge_softc_t *sc)
4631{
4632	int err;
4633
4634	if (sc->num_slices > 1)
4635		err = mxge_add_msix_irqs(sc);
4636	else
4637		err = mxge_add_single_irq(sc);
4638
4639	if (0 && err == 0 && sc->num_slices > 1) {
4640		mxge_rem_msix_irqs(sc);
4641		err = mxge_add_msix_irqs(sc);
4642	}
4643	return err;
4644}
4645
4646
4647static int
4648mxge_attach(device_t dev)
4649{
4650	mxge_softc_t *sc = device_get_softc(dev);
4651	struct ifnet *ifp;
4652	int err, rid;
4653
4654	sc->dev = dev;
4655	mxge_fetch_tunables(sc);
4656
4657	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4658	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4659				  taskqueue_thread_enqueue, &sc->tq);
4660	if (sc->tq == NULL) {
4661		err = ENOMEM;
4662		goto abort_with_nothing;
4663	}
4664
4665	err = bus_dma_tag_create(NULL,			/* parent */
4666				 1,			/* alignment */
4667				 0,			/* boundary */
4668				 BUS_SPACE_MAXADDR,	/* low */
4669				 BUS_SPACE_MAXADDR,	/* high */
4670				 NULL, NULL,		/* filter */
4671				 65536 + 256,		/* maxsize */
4672				 MXGE_MAX_SEND_DESC, 	/* num segs */
4673				 65536,			/* maxsegsize */
4674				 0,			/* flags */
4675				 NULL, NULL,		/* lock */
4676				 &sc->parent_dmat);	/* tag */
4677
4678	if (err != 0) {
4679		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4680			      err);
4681		goto abort_with_tq;
4682	}
4683
4684	ifp = sc->ifp = if_alloc(IFT_ETHER);
4685	if (ifp == NULL) {
4686		device_printf(dev, "can not if_alloc()\n");
4687		err = ENOSPC;
4688		goto abort_with_parent_dmat;
4689	}
4690	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4691
4692	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4693		 device_get_nameunit(dev));
4694	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4695	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4696		 "%s:drv", device_get_nameunit(dev));
4697	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4698		 MTX_NETWORK_LOCK, MTX_DEF);
4699
4700	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4701
4702	mxge_setup_cfg_space(sc);
4703
4704	/* Map the board into the kernel */
4705	rid = PCIR_BARS;
4706	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4707					 ~0, 1, RF_ACTIVE);
4708	if (sc->mem_res == NULL) {
4709		device_printf(dev, "could not map memory\n");
4710		err = ENXIO;
4711		goto abort_with_lock;
4712	}
4713	sc->sram = rman_get_virtual(sc->mem_res);
4714	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4715	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4716		device_printf(dev, "impossible memory region size %ld\n",
4717			      rman_get_size(sc->mem_res));
4718		err = ENXIO;
4719		goto abort_with_mem_res;
4720	}
4721
4722	/* make NULL terminated copy of the EEPROM strings section of
4723	   lanai SRAM */
4724	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4725	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4726				rman_get_bushandle(sc->mem_res),
4727				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4728				sc->eeprom_strings,
4729				MXGE_EEPROM_STRINGS_SIZE - 2);
4730	err = mxge_parse_strings(sc);
4731	if (err != 0)
4732		goto abort_with_mem_res;
4733
4734	/* Enable write combining for efficient use of PCIe bus */
4735	mxge_enable_wc(sc);
4736
4737	/* Allocate the out of band dma memory */
4738	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4739			     sizeof (mxge_cmd_t), 64);
4740	if (err != 0)
4741		goto abort_with_mem_res;
4742	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4743	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4744	if (err != 0)
4745		goto abort_with_cmd_dma;
4746
4747	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4748	if (err != 0)
4749		goto abort_with_zeropad_dma;
4750
4751	/* select & load the firmware */
4752	err = mxge_select_firmware(sc);
4753	if (err != 0)
4754		goto abort_with_dmabench;
4755	sc->intr_coal_delay = mxge_intr_coal_delay;
4756
4757	mxge_slice_probe(sc);
4758	err = mxge_alloc_slices(sc);
4759	if (err != 0)
4760		goto abort_with_dmabench;
4761
4762	err = mxge_reset(sc, 0);
4763	if (err != 0)
4764		goto abort_with_slices;
4765
4766	err = mxge_alloc_rings(sc);
4767	if (err != 0) {
4768		device_printf(sc->dev, "failed to allocate rings\n");
4769		goto abort_with_slices;
4770	}
4771
4772	err = mxge_add_irq(sc);
4773	if (err != 0) {
4774		device_printf(sc->dev, "failed to add irq\n");
4775		goto abort_with_rings;
4776	}
4777
4778	ifp->if_baudrate = IF_Gbps(10UL);
4779	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4780		IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
4781#ifdef INET
4782	ifp->if_capabilities |= IFCAP_LRO;
4783#endif
4784
4785#ifdef MXGE_NEW_VLAN_API
4786	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4787
4788	/* Only FW 1.4.32 and newer can do TSO over vlans */
4789	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4790	    sc->fw_ver_tiny >= 32)
4791		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4792#endif
4793
4794	sc->max_mtu = mxge_max_mtu(sc);
4795	if (sc->max_mtu >= 9000)
4796		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4797	else
4798		device_printf(dev, "MTU limited to %d.  Install "
4799			      "latest firmware for 9000 byte jumbo support\n",
4800			      sc->max_mtu - ETHER_HDR_LEN);
4801	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4802	ifp->if_capenable = ifp->if_capabilities;
4803	if (sc->lro_cnt == 0)
4804		ifp->if_capenable &= ~IFCAP_LRO;
4805	sc->csum_flag = 1;
4806        ifp->if_init = mxge_init;
4807        ifp->if_softc = sc;
4808        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4809        ifp->if_ioctl = mxge_ioctl;
4810        ifp->if_start = mxge_start;
4811	/* Initialise the ifmedia structure */
4812	ifmedia_init(&sc->media, 0, mxge_media_change,
4813		     mxge_media_status);
4814	mxge_media_init(sc);
4815	mxge_media_probe(sc);
4816	sc->dying = 0;
4817	ether_ifattach(ifp, sc->mac_addr);
4818	/* ether_ifattach sets mtu to ETHERMTU */
4819	if (mxge_initial_mtu != ETHERMTU)
4820		mxge_change_mtu(sc, mxge_initial_mtu);
4821
4822	mxge_add_sysctls(sc);
4823#ifdef IFNET_BUF_RING
4824	ifp->if_transmit = mxge_transmit;
4825	ifp->if_qflush = mxge_qflush;
4826#endif
4827	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4828				device_get_nameunit(sc->dev));
4829	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4830	return 0;
4831
4832abort_with_rings:
4833	mxge_free_rings(sc);
4834abort_with_slices:
4835	mxge_free_slices(sc);
4836abort_with_dmabench:
4837	mxge_dma_free(&sc->dmabench_dma);
4838abort_with_zeropad_dma:
4839	mxge_dma_free(&sc->zeropad_dma);
4840abort_with_cmd_dma:
4841	mxge_dma_free(&sc->cmd_dma);
4842abort_with_mem_res:
4843	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4844abort_with_lock:
4845	pci_disable_busmaster(dev);
4846	mtx_destroy(&sc->cmd_mtx);
4847	mtx_destroy(&sc->driver_mtx);
4848	if_free(ifp);
4849abort_with_parent_dmat:
4850	bus_dma_tag_destroy(sc->parent_dmat);
4851abort_with_tq:
4852	if (sc->tq != NULL) {
4853		taskqueue_drain(sc->tq, &sc->watchdog_task);
4854		taskqueue_free(sc->tq);
4855		sc->tq = NULL;
4856	}
4857abort_with_nothing:
4858	return err;
4859}
4860
4861static int
4862mxge_detach(device_t dev)
4863{
4864	mxge_softc_t *sc = device_get_softc(dev);
4865
4866	if (mxge_vlans_active(sc)) {
4867		device_printf(sc->dev,
4868			      "Detach vlans before removing module\n");
4869		return EBUSY;
4870	}
4871	mtx_lock(&sc->driver_mtx);
4872	sc->dying = 1;
4873	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4874		mxge_close(sc, 0);
4875	mtx_unlock(&sc->driver_mtx);
4876	ether_ifdetach(sc->ifp);
4877	if (sc->tq != NULL) {
4878		taskqueue_drain(sc->tq, &sc->watchdog_task);
4879		taskqueue_free(sc->tq);
4880		sc->tq = NULL;
4881	}
4882	callout_drain(&sc->co_hdl);
4883	ifmedia_removeall(&sc->media);
4884	mxge_dummy_rdma(sc, 0);
4885	mxge_rem_sysctls(sc);
4886	mxge_rem_irq(sc);
4887	mxge_free_rings(sc);
4888	mxge_free_slices(sc);
4889	mxge_dma_free(&sc->dmabench_dma);
4890	mxge_dma_free(&sc->zeropad_dma);
4891	mxge_dma_free(&sc->cmd_dma);
4892	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4893	pci_disable_busmaster(dev);
4894	mtx_destroy(&sc->cmd_mtx);
4895	mtx_destroy(&sc->driver_mtx);
4896	if_free(sc->ifp);
4897	bus_dma_tag_destroy(sc->parent_dmat);
4898	return 0;
4899}
4900
4901static int
4902mxge_shutdown(device_t dev)
4903{
4904	return 0;
4905}
4906
4907/*
4908  This file uses Myri10GE driver indentation.
4909
4910  Local Variables:
4911  c-file-style:"linux"
4912  tab-width:8
4913  End:
4914*/
4915