1/******************************************************************************
2SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3
4Copyright (c) 2006-2013, Myricom Inc.
5All rights reserved.
6
7Redistribution and use in source and binary forms, with or without
8modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice,
11    this list of conditions and the following disclaimer.
12
13 2. Neither the name of the Myricom Inc, nor the names of its
14    contributors may be used to endorse or promote products derived from
15    this software without specific prior written permission.
16
17THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27POSSIBILITY OF SUCH DAMAGE.
28
29***************************************************************************/
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD$");
33
34#include <sys/param.h>
35#include <sys/systm.h>
36#include <sys/linker.h>
37#include <sys/firmware.h>
38#include <sys/endian.h>
39#include <sys/sockio.h>
40#include <sys/mbuf.h>
41#include <sys/malloc.h>
42#include <sys/kdb.h>
43#include <sys/kernel.h>
44#include <sys/lock.h>
45#include <sys/module.h>
46#include <sys/socket.h>
47#include <sys/sysctl.h>
48#include <sys/sx.h>
49#include <sys/taskqueue.h>
50#include <sys/zlib.h>
51
52#include <net/if.h>
53#include <net/if_var.h>
54#include <net/if_arp.h>
55#include <net/ethernet.h>
56#include <net/if_dl.h>
57#include <net/if_media.h>
58
59#include <net/bpf.h>
60
61#include <net/if_types.h>
62#include <net/if_vlan_var.h>
63
64#include <netinet/in_systm.h>
65#include <netinet/in.h>
66#include <netinet/ip.h>
67#include <netinet/ip6.h>
68#include <netinet/tcp.h>
69#include <netinet/tcp_lro.h>
70#include <netinet6/ip6_var.h>
71
72#include <machine/bus.h>
73#include <machine/in_cksum.h>
74#include <machine/resource.h>
75#include <sys/bus.h>
76#include <sys/rman.h>
77#include <sys/smp.h>
78
79#include <dev/pci/pcireg.h>
80#include <dev/pci/pcivar.h>
81#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
82
83#include <vm/vm.h>		/* for pmap_mapdev() */
84#include <vm/pmap.h>
85
86#if defined(__i386) || defined(__amd64)
87#include <machine/specialreg.h>
88#endif
89
90#include <dev/mxge/mxge_mcp.h>
91#include <dev/mxge/mcp_gen_header.h>
92/*#define MXGE_FAKE_IFP*/
93#include <dev/mxge/if_mxge_var.h>
94#ifdef IFNET_BUF_RING
95#include <sys/buf_ring.h>
96#endif
97
98#include "opt_inet.h"
99#include "opt_inet6.h"
100
101/* tunable params */
102static int mxge_nvidia_ecrc_enable = 1;
103static int mxge_force_firmware = 0;
104static int mxge_intr_coal_delay = 30;
105static int mxge_deassert_wait = 1;
106static int mxge_flow_control = 1;
107static int mxge_verbose = 0;
108static int mxge_ticks;
109static int mxge_max_slices = 1;
110static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
111static int mxge_always_promisc = 0;
112static int mxge_initial_mtu = ETHERMTU_JUMBO;
113static int mxge_throttle = 0;
114static char *mxge_fw_unaligned = "mxge_ethp_z8e";
115static char *mxge_fw_aligned = "mxge_eth_z8e";
116static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
117static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
118
119static int mxge_probe(device_t dev);
120static int mxge_attach(device_t dev);
121static int mxge_detach(device_t dev);
122static int mxge_shutdown(device_t dev);
123static void mxge_intr(void *arg);
124
125static device_method_t mxge_methods[] =
126{
127  /* Device interface */
128  DEVMETHOD(device_probe, mxge_probe),
129  DEVMETHOD(device_attach, mxge_attach),
130  DEVMETHOD(device_detach, mxge_detach),
131  DEVMETHOD(device_shutdown, mxge_shutdown),
132
133  DEVMETHOD_END
134};
135
136static driver_t mxge_driver =
137{
138  "mxge",
139  mxge_methods,
140  sizeof(mxge_softc_t),
141};
142
143static devclass_t mxge_devclass;
144
145/* Declare ourselves to be a child of the PCI bus.*/
146DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
147MODULE_DEPEND(mxge, firmware, 1, 1, 1);
148MODULE_DEPEND(mxge, zlib, 1, 1, 1);
149
150static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
151static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
152static int mxge_close(mxge_softc_t *sc, int down);
153static int mxge_open(mxge_softc_t *sc);
154static void mxge_tick(void *arg);
155
156static int
157mxge_probe(device_t dev)
158{
159	int rev;
160
161
162	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
163	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
164	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
165		rev = pci_get_revid(dev);
166		switch (rev) {
167		case MXGE_PCI_REV_Z8E:
168			device_set_desc(dev, "Myri10G-PCIE-8A");
169			break;
170		case MXGE_PCI_REV_Z8ES:
171			device_set_desc(dev, "Myri10G-PCIE-8B");
172			break;
173		default:
174			device_set_desc(dev, "Myri10G-PCIE-8??");
175			device_printf(dev, "Unrecognized rev %d NIC\n",
176				      rev);
177			break;
178		}
179		return 0;
180	}
181	return ENXIO;
182}
183
184static void
185mxge_enable_wc(mxge_softc_t *sc)
186{
187#if defined(__i386) || defined(__amd64)
188	vm_offset_t len;
189	int err;
190
191	sc->wc = 1;
192	len = rman_get_size(sc->mem_res);
193	err = pmap_change_attr((vm_offset_t) sc->sram,
194			       len, PAT_WRITE_COMBINING);
195	if (err != 0) {
196		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
197			      err);
198		sc->wc = 0;
199	}
200#endif
201}
202
203
204/* callback to get our DMA address */
205static void
206mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
207			 int error)
208{
209	if (error == 0) {
210		*(bus_addr_t *) arg = segs->ds_addr;
211	}
212}
213
214static int
215mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
216		   bus_size_t alignment)
217{
218	int err;
219	device_t dev = sc->dev;
220	bus_size_t boundary, maxsegsize;
221
222	if (bytes > 4096 && alignment == 4096) {
223		boundary = 0;
224		maxsegsize = bytes;
225	} else {
226		boundary = 4096;
227		maxsegsize = 4096;
228	}
229
230	/* allocate DMAable memory tags */
231	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
232				 alignment,		/* alignment */
233				 boundary,		/* boundary */
234				 BUS_SPACE_MAXADDR,	/* low */
235				 BUS_SPACE_MAXADDR,	/* high */
236				 NULL, NULL,		/* filter */
237				 bytes,			/* maxsize */
238				 1,			/* num segs */
239				 maxsegsize,		/* maxsegsize */
240				 BUS_DMA_COHERENT,	/* flags */
241				 NULL, NULL,		/* lock */
242				 &dma->dmat);		/* tag */
243	if (err != 0) {
244		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
245		return err;
246	}
247
248	/* allocate DMAable memory & map */
249	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
250			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
251				| BUS_DMA_ZERO),  &dma->map);
252	if (err != 0) {
253		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
254		goto abort_with_dmat;
255	}
256
257	/* load the memory */
258	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
259			      mxge_dmamap_callback,
260			      (void *)&dma->bus_addr, 0);
261	if (err != 0) {
262		device_printf(dev, "couldn't load map (err = %d)\n", err);
263		goto abort_with_mem;
264	}
265	return 0;
266
267abort_with_mem:
268	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
269abort_with_dmat:
270	(void)bus_dma_tag_destroy(dma->dmat);
271	return err;
272}
273
274
275static void
276mxge_dma_free(mxge_dma_t *dma)
277{
278	bus_dmamap_unload(dma->dmat, dma->map);
279	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
280	(void)bus_dma_tag_destroy(dma->dmat);
281}
282
283/*
284 * The eeprom strings on the lanaiX have the format
285 * SN=x\0
286 * MAC=x:x:x:x:x:x\0
287 * PC=text\0
288 */
289
290static int
291mxge_parse_strings(mxge_softc_t *sc)
292{
293	char *ptr;
294	int i, found_mac, found_sn2;
295	char *endptr;
296
297	ptr = sc->eeprom_strings;
298	found_mac = 0;
299	found_sn2 = 0;
300	while (*ptr != '\0') {
301		if (strncmp(ptr, "MAC=", 4) == 0) {
302			ptr += 4;
303			for (i = 0;;) {
304				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
305				if (endptr - ptr != 2)
306					goto abort;
307				ptr = endptr;
308				if (++i == 6)
309					break;
310				if (*ptr++ != ':')
311					goto abort;
312			}
313			found_mac = 1;
314		} else if (strncmp(ptr, "PC=", 3) == 0) {
315			ptr += 3;
316			strlcpy(sc->product_code_string, ptr,
317			    sizeof(sc->product_code_string));
318		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
319			ptr += 3;
320			strlcpy(sc->serial_number_string, ptr,
321			    sizeof(sc->serial_number_string));
322		} else if (strncmp(ptr, "SN2=", 4) == 0) {
323			/* SN2 takes precedence over SN */
324			ptr += 4;
325			found_sn2 = 1;
326			strlcpy(sc->serial_number_string, ptr,
327			    sizeof(sc->serial_number_string));
328		}
329		while (*ptr++ != '\0') {}
330	}
331
332	if (found_mac)
333		return 0;
334
335 abort:
336	device_printf(sc->dev, "failed to parse eeprom_strings\n");
337
338	return ENXIO;
339}
340
341#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
342static void
343mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
344{
345	uint32_t val;
346	unsigned long base, off;
347	char *va, *cfgptr;
348	device_t pdev, mcp55;
349	uint16_t vendor_id, device_id, word;
350	uintptr_t bus, slot, func, ivend, idev;
351	uint32_t *ptr32;
352
353
354	if (!mxge_nvidia_ecrc_enable)
355		return;
356
357	pdev = device_get_parent(device_get_parent(sc->dev));
358	if (pdev == NULL) {
359		device_printf(sc->dev, "could not find parent?\n");
360		return;
361	}
362	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
363	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
364
365	if (vendor_id != 0x10de)
366		return;
367
368	base = 0;
369
370	if (device_id == 0x005d) {
371		/* ck804, base address is magic */
372		base = 0xe0000000UL;
373	} else if (device_id >= 0x0374 && device_id <= 0x378) {
374		/* mcp55, base address stored in chipset */
375		mcp55 = pci_find_bsf(0, 0, 0);
376		if (mcp55 &&
377		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
378		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
379			word = pci_read_config(mcp55, 0x90, 2);
380			base = ((unsigned long)word & 0x7ffeU) << 25;
381		}
382	}
383	if (!base)
384		return;
385
386	/* XXXX
387	   Test below is commented because it is believed that doing
388	   config read/write beyond 0xff will access the config space
389	   for the next larger function.  Uncomment this and remove
390	   the hacky pmap_mapdev() way of accessing config space when
391	   FreeBSD grows support for extended pcie config space access
392	*/
393#if 0
394	/* See if we can, by some miracle, access the extended
395	   config space */
396	val = pci_read_config(pdev, 0x178, 4);
397	if (val != 0xffffffff) {
398		val |= 0x40;
399		pci_write_config(pdev, 0x178, val, 4);
400		return;
401	}
402#endif
403	/* Rather than using normal pci config space writes, we must
404	 * map the Nvidia config space ourselves.  This is because on
405	 * opteron/nvidia class machine the 0xe000000 mapping is
406	 * handled by the nvidia chipset, that means the internal PCI
407	 * device (the on-chip northbridge), or the amd-8131 bridge
408	 * and things behind them are not visible by this method.
409	 */
410
411	BUS_READ_IVAR(device_get_parent(pdev), pdev,
412		      PCI_IVAR_BUS, &bus);
413	BUS_READ_IVAR(device_get_parent(pdev), pdev,
414		      PCI_IVAR_SLOT, &slot);
415	BUS_READ_IVAR(device_get_parent(pdev), pdev,
416		      PCI_IVAR_FUNCTION, &func);
417	BUS_READ_IVAR(device_get_parent(pdev), pdev,
418		      PCI_IVAR_VENDOR, &ivend);
419	BUS_READ_IVAR(device_get_parent(pdev), pdev,
420		      PCI_IVAR_DEVICE, &idev);
421
422	off =  base
423		+ 0x00100000UL * (unsigned long)bus
424		+ 0x00001000UL * (unsigned long)(func
425						 + 8 * slot);
426
427	/* map it into the kernel */
428	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
429
430
431	if (va == NULL) {
432		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
433		return;
434	}
435	/* get a pointer to the config space mapped into the kernel */
436	cfgptr = va + (off & PAGE_MASK);
437
438	/* make sure that we can really access it */
439	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
440	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
441	if (! (vendor_id == ivend && device_id == idev)) {
442		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
443			      vendor_id, device_id);
444		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
445		return;
446	}
447
448	ptr32 = (uint32_t*)(cfgptr + 0x178);
449	val = *ptr32;
450
451	if (val == 0xffffffff) {
452		device_printf(sc->dev, "extended mapping failed\n");
453		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
454		return;
455	}
456	*ptr32 = val | 0x40;
457	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
458	if (mxge_verbose)
459		device_printf(sc->dev,
460			      "Enabled ECRC on upstream Nvidia bridge "
461			      "at %d:%d:%d\n",
462			      (int)bus, (int)slot, (int)func);
463	return;
464}
465#else
466static void
467mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
468{
469	device_printf(sc->dev,
470		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
471	return;
472}
473#endif
474
475
476static int
477mxge_dma_test(mxge_softc_t *sc, int test_type)
478{
479	mxge_cmd_t cmd;
480	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
481	int status;
482	uint32_t len;
483	char *test = " ";
484
485
486	/* Run a small DMA test.
487	 * The magic multipliers to the length tell the firmware
488	 * to do DMA read, write, or read+write tests.  The
489	 * results are returned in cmd.data0.  The upper 16
490	 * bits of the return is the number of transfers completed.
491	 * The lower 16 bits is the time in 0.5us ticks that the
492	 * transfers took to complete.
493	 */
494
495	len = sc->tx_boundary;
496
497	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
498	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
499	cmd.data2 = len * 0x10000;
500	status = mxge_send_cmd(sc, test_type, &cmd);
501	if (status != 0) {
502		test = "read";
503		goto abort;
504	}
505	sc->read_dma = ((cmd.data0>>16) * len * 2) /
506		(cmd.data0 & 0xffff);
507	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
508	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
509	cmd.data2 = len * 0x1;
510	status = mxge_send_cmd(sc, test_type, &cmd);
511	if (status != 0) {
512		test = "write";
513		goto abort;
514	}
515	sc->write_dma = ((cmd.data0>>16) * len * 2) /
516		(cmd.data0 & 0xffff);
517
518	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
519	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
520	cmd.data2 = len * 0x10001;
521	status = mxge_send_cmd(sc, test_type, &cmd);
522	if (status != 0) {
523		test = "read/write";
524		goto abort;
525	}
526	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
527		(cmd.data0 & 0xffff);
528
529abort:
530	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
531		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
532			      test, status);
533
534	return status;
535}
536
537/*
538 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
539 * when the PCI-E Completion packets are aligned on an 8-byte
540 * boundary.  Some PCI-E chip sets always align Completion packets; on
541 * the ones that do not, the alignment can be enforced by enabling
542 * ECRC generation (if supported).
543 *
544 * When PCI-E Completion packets are not aligned, it is actually more
545 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
546 *
547 * If the driver can neither enable ECRC nor verify that it has
548 * already been enabled, then it must use a firmware image which works
549 * around unaligned completion packets (ethp_z8e.dat), and it should
550 * also ensure that it never gives the device a Read-DMA which is
551 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
552 * enabled, then the driver should use the aligned (eth_z8e.dat)
553 * firmware image, and set tx_boundary to 4KB.
554 */
555
556static int
557mxge_firmware_probe(mxge_softc_t *sc)
558{
559	device_t dev = sc->dev;
560	int reg, status;
561	uint16_t pectl;
562
563	sc->tx_boundary = 4096;
564	/*
565	 * Verify the max read request size was set to 4KB
566	 * before trying the test with 4KB.
567	 */
568	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
569		pectl = pci_read_config(dev, reg + 0x8, 2);
570		if ((pectl & (5 << 12)) != (5 << 12)) {
571			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
572				      pectl);
573			sc->tx_boundary = 2048;
574		}
575	}
576
577	/*
578	 * load the optimized firmware (which assumes aligned PCIe
579	 * completions) in order to see if it works on this host.
580	 */
581	sc->fw_name = mxge_fw_aligned;
582	status = mxge_load_firmware(sc, 1);
583	if (status != 0) {
584		return status;
585	}
586
587	/*
588	 * Enable ECRC if possible
589	 */
590	mxge_enable_nvidia_ecrc(sc);
591
592	/*
593	 * Run a DMA test which watches for unaligned completions and
594	 * aborts on the first one seen.  Not required on Z8ES or newer.
595	 */
596	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
597		return 0;
598	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
599	if (status == 0)
600		return 0; /* keep the aligned firmware */
601
602	if (status != E2BIG)
603		device_printf(dev, "DMA test failed: %d\n", status);
604	if (status == ENOSYS)
605		device_printf(dev, "Falling back to ethp! "
606			      "Please install up to date fw\n");
607	return status;
608}
609
610static int
611mxge_select_firmware(mxge_softc_t *sc)
612{
613	int aligned = 0;
614	int force_firmware = mxge_force_firmware;
615
616	if (sc->throttle)
617		force_firmware = sc->throttle;
618
619	if (force_firmware != 0) {
620		if (force_firmware == 1)
621			aligned = 1;
622		else
623			aligned = 0;
624		if (mxge_verbose)
625			device_printf(sc->dev,
626				      "Assuming %s completions (forced)\n",
627				      aligned ? "aligned" : "unaligned");
628		goto abort;
629	}
630
631	/* if the PCIe link width is 4 or less, we can use the aligned
632	   firmware and skip any checks */
633	if (sc->link_width != 0 && sc->link_width <= 4) {
634		device_printf(sc->dev,
635			      "PCIe x%d Link, expect reduced performance\n",
636			      sc->link_width);
637		aligned = 1;
638		goto abort;
639	}
640
641	if (0 == mxge_firmware_probe(sc))
642		return 0;
643
644abort:
645	if (aligned) {
646		sc->fw_name = mxge_fw_aligned;
647		sc->tx_boundary = 4096;
648	} else {
649		sc->fw_name = mxge_fw_unaligned;
650		sc->tx_boundary = 2048;
651	}
652	return (mxge_load_firmware(sc, 0));
653}
654
655static int
656mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
657{
658
659
660	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
661		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
662			      be32toh(hdr->mcp_type));
663		return EIO;
664	}
665
666	/* save firmware version for sysctl */
667	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
668	if (mxge_verbose)
669		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
670
671	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
672	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
673
674	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
675	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
676		device_printf(sc->dev, "Found firmware version %s\n",
677			      sc->fw_version);
678		device_printf(sc->dev, "Driver needs %d.%d\n",
679			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
680		return EINVAL;
681	}
682	return 0;
683
684}
685
686static void *
687z_alloc(void *nil, u_int items, u_int size)
688{
689	void *ptr;
690
691	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
692	return ptr;
693}
694
695static void
696z_free(void *nil, void *ptr)
697{
698	free(ptr, M_TEMP);
699}
700
701
702static int
703mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
704{
705	z_stream zs;
706	char *inflate_buffer;
707	const struct firmware *fw;
708	const mcp_gen_header_t *hdr;
709	unsigned hdr_offset;
710	int status;
711	unsigned int i;
712	char dummy;
713	size_t fw_len;
714
715	fw = firmware_get(sc->fw_name);
716	if (fw == NULL) {
717		device_printf(sc->dev, "Could not find firmware image %s\n",
718			      sc->fw_name);
719		return ENOENT;
720	}
721
722
723
724	/* setup zlib and decompress f/w */
725	bzero(&zs, sizeof (zs));
726	zs.zalloc = z_alloc;
727	zs.zfree = z_free;
728	status = inflateInit(&zs);
729	if (status != Z_OK) {
730		status = EIO;
731		goto abort_with_fw;
732	}
733
734	/* the uncompressed size is stored as the firmware version,
735	   which would otherwise go unused */
736	fw_len = (size_t) fw->version;
737	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
738	if (inflate_buffer == NULL)
739		goto abort_with_zs;
740	zs.avail_in = fw->datasize;
741	zs.next_in = __DECONST(char *, fw->data);
742	zs.avail_out = fw_len;
743	zs.next_out = inflate_buffer;
744	status = inflate(&zs, Z_FINISH);
745	if (status != Z_STREAM_END) {
746		device_printf(sc->dev, "zlib %d\n", status);
747		status = EIO;
748		goto abort_with_buffer;
749	}
750
751	/* check id */
752	hdr_offset = htobe32(*(const uint32_t *)
753			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
754	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
755		device_printf(sc->dev, "Bad firmware file");
756		status = EIO;
757		goto abort_with_buffer;
758	}
759	hdr = (const void*)(inflate_buffer + hdr_offset);
760
761	status = mxge_validate_firmware(sc, hdr);
762	if (status != 0)
763		goto abort_with_buffer;
764
765	/* Copy the inflated firmware to NIC SRAM. */
766	for (i = 0; i < fw_len; i += 256) {
767		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
768			      inflate_buffer + i,
769			      min(256U, (unsigned)(fw_len - i)));
770		wmb();
771		dummy = *sc->sram;
772		wmb();
773	}
774
775	*limit = fw_len;
776	status = 0;
777abort_with_buffer:
778	free(inflate_buffer, M_TEMP);
779abort_with_zs:
780	inflateEnd(&zs);
781abort_with_fw:
782	firmware_put(fw, FIRMWARE_UNLOAD);
783	return status;
784}
785
786/*
787 * Enable or disable periodic RDMAs from the host to make certain
788 * chipsets resend dropped PCIe messages
789 */
790
791static void
792mxge_dummy_rdma(mxge_softc_t *sc, int enable)
793{
794	char buf_bytes[72];
795	volatile uint32_t *confirm;
796	volatile char *submit;
797	uint32_t *buf, dma_low, dma_high;
798	int i;
799
800	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
801
802	/* clear confirmation addr */
803	confirm = (volatile uint32_t *)sc->cmd;
804	*confirm = 0;
805	wmb();
806
807	/* send an rdma command to the PCIe engine, and wait for the
808	   response in the confirmation address.  The firmware should
809	   write a -1 there to indicate it is alive and well
810	*/
811
812	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
813	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
814	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
815	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
816	buf[2] = htobe32(0xffffffff);		/* confirm data */
817	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
818	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
819	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
820	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
821	buf[5] = htobe32(enable);			/* enable? */
822
823
824	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
825
826	mxge_pio_copy(submit, buf, 64);
827	wmb();
828	DELAY(1000);
829	wmb();
830	i = 0;
831	while (*confirm != 0xffffffff && i < 20) {
832		DELAY(1000);
833		i++;
834	}
835	if (*confirm != 0xffffffff) {
836		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
837			      (enable ? "enable" : "disable"), confirm,
838			      *confirm);
839	}
840	return;
841}
842
843static int
844mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
845{
846	mcp_cmd_t *buf;
847	char buf_bytes[sizeof(*buf) + 8];
848	volatile mcp_cmd_response_t *response = sc->cmd;
849	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
850	uint32_t dma_low, dma_high;
851	int err, sleep_total = 0;
852
853	/* ensure buf is aligned to 8 bytes */
854	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
855
856	buf->data0 = htobe32(data->data0);
857	buf->data1 = htobe32(data->data1);
858	buf->data2 = htobe32(data->data2);
859	buf->cmd = htobe32(cmd);
860	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
861	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
862
863	buf->response_addr.low = htobe32(dma_low);
864	buf->response_addr.high = htobe32(dma_high);
865	mtx_lock(&sc->cmd_mtx);
866	response->result = 0xffffffff;
867	wmb();
868	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
869
870	/* wait up to 20ms */
871	err = EAGAIN;
872	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
873		bus_dmamap_sync(sc->cmd_dma.dmat,
874				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
875		wmb();
876		switch (be32toh(response->result)) {
877		case 0:
878			data->data0 = be32toh(response->data);
879			err = 0;
880			break;
881		case 0xffffffff:
882			DELAY(1000);
883			break;
884		case MXGEFW_CMD_UNKNOWN:
885			err = ENOSYS;
886			break;
887		case MXGEFW_CMD_ERROR_UNALIGNED:
888			err = E2BIG;
889			break;
890		case MXGEFW_CMD_ERROR_BUSY:
891			err = EBUSY;
892			break;
893		case MXGEFW_CMD_ERROR_I2C_ABSENT:
894			err = ENXIO;
895			break;
896		default:
897			device_printf(sc->dev,
898				      "mxge: command %d "
899				      "failed, result = %d\n",
900				      cmd, be32toh(response->result));
901			err = ENXIO;
902			break;
903		}
904		if (err != EAGAIN)
905			break;
906	}
907	if (err == EAGAIN)
908		device_printf(sc->dev, "mxge: command %d timed out"
909			      "result = %d\n",
910			      cmd, be32toh(response->result));
911	mtx_unlock(&sc->cmd_mtx);
912	return err;
913}
914
915static int
916mxge_adopt_running_firmware(mxge_softc_t *sc)
917{
918	struct mcp_gen_header *hdr;
919	const size_t bytes = sizeof (struct mcp_gen_header);
920	size_t hdr_offset;
921	int status;
922
923	/* find running firmware header */
924	hdr_offset = htobe32(*(volatile uint32_t *)
925			     (sc->sram + MCP_HEADER_PTR_OFFSET));
926
927	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
928		device_printf(sc->dev,
929			      "Running firmware has bad header offset (%d)\n",
930			      (int)hdr_offset);
931		return EIO;
932	}
933
934	/* copy header of running firmware from SRAM to host memory to
935	 * validate firmware */
936	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
937	if (hdr == NULL) {
938		device_printf(sc->dev, "could not malloc firmware hdr\n");
939		return ENOMEM;
940	}
941	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
942				rman_get_bushandle(sc->mem_res),
943				hdr_offset, (char *)hdr, bytes);
944	status = mxge_validate_firmware(sc, hdr);
945	free(hdr, M_DEVBUF);
946
947	/*
948	 * check to see if adopted firmware has bug where adopting
949	 * it will cause broadcasts to be filtered unless the NIC
950	 * is kept in ALLMULTI mode
951	 */
952	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
953	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
954		sc->adopted_rx_filter_bug = 1;
955		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
956			      "working around rx filter bug\n",
957			      sc->fw_ver_major, sc->fw_ver_minor,
958			      sc->fw_ver_tiny);
959	}
960
961	return status;
962}
963
964
965static int
966mxge_load_firmware(mxge_softc_t *sc, int adopt)
967{
968	volatile uint32_t *confirm;
969	volatile char *submit;
970	char buf_bytes[72];
971	uint32_t *buf, size, dma_low, dma_high;
972	int status, i;
973
974	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
975
976	size = sc->sram_size;
977	status = mxge_load_firmware_helper(sc, &size);
978	if (status) {
979		if (!adopt)
980			return status;
981		/* Try to use the currently running firmware, if
982		   it is new enough */
983		status = mxge_adopt_running_firmware(sc);
984		if (status) {
985			device_printf(sc->dev,
986				      "failed to adopt running firmware\n");
987			return status;
988		}
989		device_printf(sc->dev,
990			      "Successfully adopted running firmware\n");
991		if (sc->tx_boundary == 4096) {
992			device_printf(sc->dev,
993				"Using firmware currently running on NIC"
994				 ".  For optimal\n");
995			device_printf(sc->dev,
996				 "performance consider loading optimized "
997				 "firmware\n");
998		}
999		sc->fw_name = mxge_fw_unaligned;
1000		sc->tx_boundary = 2048;
1001		return 0;
1002	}
1003	/* clear confirmation addr */
1004	confirm = (volatile uint32_t *)sc->cmd;
1005	*confirm = 0;
1006	wmb();
1007	/* send a reload command to the bootstrap MCP, and wait for the
1008	   response in the confirmation address.  The firmware should
1009	   write a -1 there to indicate it is alive and well
1010	*/
1011
1012	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1013	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1014
1015	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1016	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1017	buf[2] = htobe32(0xffffffff);	/* confirm data */
1018
1019	/* FIX: All newest firmware should un-protect the bottom of
1020	   the sram before handoff. However, the very first interfaces
1021	   do not. Therefore the handoff copy must skip the first 8 bytes
1022	*/
1023					/* where the code starts*/
1024	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1025	buf[4] = htobe32(size - 8); 	/* length of code */
1026	buf[5] = htobe32(8);		/* where to copy to */
1027	buf[6] = htobe32(0);		/* where to jump to */
1028
1029	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1030	mxge_pio_copy(submit, buf, 64);
1031	wmb();
1032	DELAY(1000);
1033	wmb();
1034	i = 0;
1035	while (*confirm != 0xffffffff && i < 20) {
1036		DELAY(1000*10);
1037		i++;
1038		bus_dmamap_sync(sc->cmd_dma.dmat,
1039				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1040	}
1041	if (*confirm != 0xffffffff) {
1042		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1043			confirm, *confirm);
1044
1045		return ENXIO;
1046	}
1047	return 0;
1048}
1049
1050static int
1051mxge_update_mac_address(mxge_softc_t *sc)
1052{
1053	mxge_cmd_t cmd;
1054	uint8_t *addr = sc->mac_addr;
1055	int status;
1056
1057
1058	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1059		     | (addr[2] << 8) | addr[3]);
1060
1061	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1062
1063	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1064	return status;
1065}
1066
1067static int
1068mxge_change_pause(mxge_softc_t *sc, int pause)
1069{
1070	mxge_cmd_t cmd;
1071	int status;
1072
1073	if (pause)
1074		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1075				       &cmd);
1076	else
1077		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1078				       &cmd);
1079
1080	if (status) {
1081		device_printf(sc->dev, "Failed to set flow control mode\n");
1082		return ENXIO;
1083	}
1084	sc->pause = pause;
1085	return 0;
1086}
1087
1088static void
1089mxge_change_promisc(mxge_softc_t *sc, int promisc)
1090{
1091	mxge_cmd_t cmd;
1092	int status;
1093
1094	if (mxge_always_promisc)
1095		promisc = 1;
1096
1097	if (promisc)
1098		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1099				       &cmd);
1100	else
1101		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1102				       &cmd);
1103
1104	if (status) {
1105		device_printf(sc->dev, "Failed to set promisc mode\n");
1106	}
1107}
1108
1109static void
1110mxge_set_multicast_list(mxge_softc_t *sc)
1111{
1112	mxge_cmd_t cmd;
1113	struct ifmultiaddr *ifma;
1114	struct ifnet *ifp = sc->ifp;
1115	int err;
1116
1117	/* This firmware is known to not support multicast */
1118	if (!sc->fw_multicast_support)
1119		return;
1120
1121	/* Disable multicast filtering while we play with the lists*/
1122	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1123	if (err != 0) {
1124		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1125		       " error status: %d\n", err);
1126		return;
1127	}
1128
1129	if (sc->adopted_rx_filter_bug)
1130		return;
1131
1132	if (ifp->if_flags & IFF_ALLMULTI)
1133		/* request to disable multicast filtering, so quit here */
1134		return;
1135
1136	/* Flush all the filters */
1137
1138	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1139	if (err != 0) {
1140		device_printf(sc->dev,
1141			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1142			      ", error status: %d\n", err);
1143		return;
1144	}
1145
1146	/* Walk the multicast list, and add each address */
1147
1148	if_maddr_rlock(ifp);
1149	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1150		if (ifma->ifma_addr->sa_family != AF_LINK)
1151			continue;
1152		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1153		      &cmd.data0, 4);
1154		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1155		      &cmd.data1, 2);
1156		cmd.data0 = htonl(cmd.data0);
1157		cmd.data1 = htonl(cmd.data1);
1158		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1159		if (err != 0) {
1160			device_printf(sc->dev, "Failed "
1161			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1162			       "%d\t", err);
1163			/* abort, leaving multicast filtering off */
1164			if_maddr_runlock(ifp);
1165			return;
1166		}
1167	}
1168	if_maddr_runlock(ifp);
1169	/* Enable multicast filtering */
1170	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1171	if (err != 0) {
1172		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1173		       ", error status: %d\n", err);
1174	}
1175}
1176
1177static int
1178mxge_max_mtu(mxge_softc_t *sc)
1179{
1180	mxge_cmd_t cmd;
1181	int status;
1182
1183	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1184		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1185
1186	/* try to set nbufs to see if it we can
1187	   use virtually contiguous jumbos */
1188	cmd.data0 = 0;
1189	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1190			       &cmd);
1191	if (status == 0)
1192		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1193
1194	/* otherwise, we're limited to MJUMPAGESIZE */
1195	return MJUMPAGESIZE - MXGEFW_PAD;
1196}
1197
1198static int
1199mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1200{
1201	struct mxge_slice_state *ss;
1202	mxge_rx_done_t *rx_done;
1203	volatile uint32_t *irq_claim;
1204	mxge_cmd_t cmd;
1205	int slice, status;
1206
1207	/* try to send a reset command to the card to see if it
1208	   is alive */
1209	memset(&cmd, 0, sizeof (cmd));
1210	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1211	if (status != 0) {
1212		device_printf(sc->dev, "failed reset\n");
1213		return ENXIO;
1214	}
1215
1216	mxge_dummy_rdma(sc, 1);
1217
1218
1219	/* set the intrq size */
1220	cmd.data0 = sc->rx_ring_size;
1221	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1222
1223	/*
1224	 * Even though we already know how many slices are supported
1225	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1226	 * has magic side effects, and must be called after a reset.
1227	 * It must be called prior to calling any RSS related cmds,
1228	 * including assigning an interrupt queue for anything but
1229	 * slice 0.  It must also be called *after*
1230	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1231	 * the firmware to compute offsets.
1232	 */
1233
1234	if (sc->num_slices > 1) {
1235		/* ask the maximum number of slices it supports */
1236		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1237					   &cmd);
1238		if (status != 0) {
1239			device_printf(sc->dev,
1240				      "failed to get number of slices\n");
1241			return status;
1242		}
1243		/*
1244		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1245		 * to setting up the interrupt queue DMA
1246		 */
1247		cmd.data0 = sc->num_slices;
1248		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1249#ifdef IFNET_BUF_RING
1250		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1251#endif
1252		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1253					   &cmd);
1254		if (status != 0) {
1255			device_printf(sc->dev,
1256				      "failed to set number of slices\n");
1257			return status;
1258		}
1259	}
1260
1261
1262	if (interrupts_setup) {
1263		/* Now exchange information about interrupts  */
1264		for (slice = 0; slice < sc->num_slices; slice++) {
1265			rx_done = &sc->ss[slice].rx_done;
1266			memset(rx_done->entry, 0, sc->rx_ring_size);
1267			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1268			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1269			cmd.data2 = slice;
1270			status |= mxge_send_cmd(sc,
1271						MXGEFW_CMD_SET_INTRQ_DMA,
1272						&cmd);
1273		}
1274	}
1275
1276	status |= mxge_send_cmd(sc,
1277				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1278
1279
1280	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1281
1282	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1283	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1284
1285
1286	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1287				&cmd);
1288	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1289	if (status != 0) {
1290		device_printf(sc->dev, "failed set interrupt parameters\n");
1291		return status;
1292	}
1293
1294
1295	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1296
1297
1298	/* run a DMA benchmark */
1299	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1300
1301	for (slice = 0; slice < sc->num_slices; slice++) {
1302		ss = &sc->ss[slice];
1303
1304		ss->irq_claim = irq_claim + (2 * slice);
1305		/* reset mcp/driver shared state back to 0 */
1306		ss->rx_done.idx = 0;
1307		ss->rx_done.cnt = 0;
1308		ss->tx.req = 0;
1309		ss->tx.done = 0;
1310		ss->tx.pkt_done = 0;
1311		ss->tx.queue_active = 0;
1312		ss->tx.activate = 0;
1313		ss->tx.deactivate = 0;
1314		ss->tx.wake = 0;
1315		ss->tx.defrag = 0;
1316		ss->tx.stall = 0;
1317		ss->rx_big.cnt = 0;
1318		ss->rx_small.cnt = 0;
1319		ss->lc.lro_bad_csum = 0;
1320		ss->lc.lro_queued = 0;
1321		ss->lc.lro_flushed = 0;
1322		if (ss->fw_stats != NULL) {
1323			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1324		}
1325	}
1326	sc->rdma_tags_available = 15;
1327	status = mxge_update_mac_address(sc);
1328	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1329	mxge_change_pause(sc, sc->pause);
1330	mxge_set_multicast_list(sc);
1331	if (sc->throttle) {
1332		cmd.data0 = sc->throttle;
1333		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1334				  &cmd)) {
1335			device_printf(sc->dev,
1336				      "can't enable throttle\n");
1337		}
1338	}
1339	return status;
1340}
1341
1342static int
1343mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1344{
1345	mxge_cmd_t cmd;
1346	mxge_softc_t *sc;
1347	int err;
1348	unsigned int throttle;
1349
1350	sc = arg1;
1351	throttle = sc->throttle;
1352	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1353	if (err != 0) {
1354		return err;
1355	}
1356
1357	if (throttle == sc->throttle)
1358		return 0;
1359
1360	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1361		return EINVAL;
1362
1363	mtx_lock(&sc->driver_mtx);
1364	cmd.data0 = throttle;
1365	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1366	if (err == 0)
1367		sc->throttle = throttle;
1368	mtx_unlock(&sc->driver_mtx);
1369	return err;
1370}
1371
1372static int
1373mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1374{
1375	mxge_softc_t *sc;
1376	unsigned int intr_coal_delay;
1377	int err;
1378
1379	sc = arg1;
1380	intr_coal_delay = sc->intr_coal_delay;
1381	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1382	if (err != 0) {
1383		return err;
1384	}
1385	if (intr_coal_delay == sc->intr_coal_delay)
1386		return 0;
1387
1388	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1389		return EINVAL;
1390
1391	mtx_lock(&sc->driver_mtx);
1392	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1393	sc->intr_coal_delay = intr_coal_delay;
1394
1395	mtx_unlock(&sc->driver_mtx);
1396	return err;
1397}
1398
1399static int
1400mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1401{
1402	mxge_softc_t *sc;
1403	unsigned int enabled;
1404	int err;
1405
1406	sc = arg1;
1407	enabled = sc->pause;
1408	err = sysctl_handle_int(oidp, &enabled, arg2, req);
1409	if (err != 0) {
1410		return err;
1411	}
1412	if (enabled == sc->pause)
1413		return 0;
1414
1415	mtx_lock(&sc->driver_mtx);
1416	err = mxge_change_pause(sc, enabled);
1417	mtx_unlock(&sc->driver_mtx);
1418	return err;
1419}
1420
1421static int
1422mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1423{
1424	int err;
1425
1426	if (arg1 == NULL)
1427		return EFAULT;
1428	arg2 = be32toh(*(int *)arg1);
1429	arg1 = NULL;
1430	err = sysctl_handle_int(oidp, arg1, arg2, req);
1431
1432	return err;
1433}
1434
1435static void
1436mxge_rem_sysctls(mxge_softc_t *sc)
1437{
1438	struct mxge_slice_state *ss;
1439	int slice;
1440
1441	if (sc->slice_sysctl_tree == NULL)
1442		return;
1443
1444	for (slice = 0; slice < sc->num_slices; slice++) {
1445		ss = &sc->ss[slice];
1446		if (ss == NULL || ss->sysctl_tree == NULL)
1447			continue;
1448		sysctl_ctx_free(&ss->sysctl_ctx);
1449		ss->sysctl_tree = NULL;
1450	}
1451	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1452	sc->slice_sysctl_tree = NULL;
1453}
1454
1455static void
1456mxge_add_sysctls(mxge_softc_t *sc)
1457{
1458	struct sysctl_ctx_list *ctx;
1459	struct sysctl_oid_list *children;
1460	mcp_irq_data_t *fw;
1461	struct mxge_slice_state *ss;
1462	int slice;
1463	char slice_num[8];
1464
1465	ctx = device_get_sysctl_ctx(sc->dev);
1466	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1467	fw = sc->ss[0].fw_stats;
1468
1469	/* random information */
1470	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1471		       "firmware_version",
1472		       CTLFLAG_RD, sc->fw_version,
1473		       0, "firmware version");
1474	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1475		       "serial_number",
1476		       CTLFLAG_RD, sc->serial_number_string,
1477		       0, "serial number");
1478	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1479		       "product_code",
1480		       CTLFLAG_RD, sc->product_code_string,
1481		       0, "product_code");
1482	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483		       "pcie_link_width",
1484		       CTLFLAG_RD, &sc->link_width,
1485		       0, "tx_boundary");
1486	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487		       "tx_boundary",
1488		       CTLFLAG_RD, &sc->tx_boundary,
1489		       0, "tx_boundary");
1490	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491		       "write_combine",
1492		       CTLFLAG_RD, &sc->wc,
1493		       0, "write combining PIO?");
1494	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495		       "read_dma_MBs",
1496		       CTLFLAG_RD, &sc->read_dma,
1497		       0, "DMA Read speed in MB/s");
1498	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499		       "write_dma_MBs",
1500		       CTLFLAG_RD, &sc->write_dma,
1501		       0, "DMA Write speed in MB/s");
1502	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1503		       "read_write_dma_MBs",
1504		       CTLFLAG_RD, &sc->read_write_dma,
1505		       0, "DMA concurrent Read/Write speed in MB/s");
1506	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1507		       "watchdog_resets",
1508		       CTLFLAG_RD, &sc->watchdog_resets,
1509		       0, "Number of times NIC was reset");
1510
1511
1512	/* performance related tunables */
1513	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1514			"intr_coal_delay",
1515			CTLTYPE_INT|CTLFLAG_RW, sc,
1516			0, mxge_change_intr_coal,
1517			"I", "interrupt coalescing delay in usecs");
1518
1519	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1520			"throttle",
1521			CTLTYPE_INT|CTLFLAG_RW, sc,
1522			0, mxge_change_throttle,
1523			"I", "transmit throttling");
1524
1525	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526			"flow_control_enabled",
1527			CTLTYPE_INT|CTLFLAG_RW, sc,
1528			0, mxge_change_flow_control,
1529			"I", "interrupt coalescing delay in usecs");
1530
1531	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532		       "deassert_wait",
1533		       CTLFLAG_RW, &mxge_deassert_wait,
1534		       0, "Wait for IRQ line to go low in ihandler");
1535
1536	/* stats block from firmware is in network byte order.
1537	   Need to swap it */
1538	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1539			"link_up",
1540			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1541			0, mxge_handle_be32,
1542			"I", "link up");
1543	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544			"rdma_tags_available",
1545			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1546			0, mxge_handle_be32,
1547			"I", "rdma_tags_available");
1548	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549			"dropped_bad_crc32",
1550			CTLTYPE_INT|CTLFLAG_RD,
1551			&fw->dropped_bad_crc32,
1552			0, mxge_handle_be32,
1553			"I", "dropped_bad_crc32");
1554	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555			"dropped_bad_phy",
1556			CTLTYPE_INT|CTLFLAG_RD,
1557			&fw->dropped_bad_phy,
1558			0, mxge_handle_be32,
1559			"I", "dropped_bad_phy");
1560	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561			"dropped_link_error_or_filtered",
1562			CTLTYPE_INT|CTLFLAG_RD,
1563			&fw->dropped_link_error_or_filtered,
1564			0, mxge_handle_be32,
1565			"I", "dropped_link_error_or_filtered");
1566	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567			"dropped_link_overflow",
1568			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1569			0, mxge_handle_be32,
1570			"I", "dropped_link_overflow");
1571	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572			"dropped_multicast_filtered",
1573			CTLTYPE_INT|CTLFLAG_RD,
1574			&fw->dropped_multicast_filtered,
1575			0, mxge_handle_be32,
1576			"I", "dropped_multicast_filtered");
1577	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578			"dropped_no_big_buffer",
1579			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1580			0, mxge_handle_be32,
1581			"I", "dropped_no_big_buffer");
1582	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583			"dropped_no_small_buffer",
1584			CTLTYPE_INT|CTLFLAG_RD,
1585			&fw->dropped_no_small_buffer,
1586			0, mxge_handle_be32,
1587			"I", "dropped_no_small_buffer");
1588	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589			"dropped_overrun",
1590			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1591			0, mxge_handle_be32,
1592			"I", "dropped_overrun");
1593	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594			"dropped_pause",
1595			CTLTYPE_INT|CTLFLAG_RD,
1596			&fw->dropped_pause,
1597			0, mxge_handle_be32,
1598			"I", "dropped_pause");
1599	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1600			"dropped_runt",
1601			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1602			0, mxge_handle_be32,
1603			"I", "dropped_runt");
1604
1605	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1606			"dropped_unicast_filtered",
1607			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1608			0, mxge_handle_be32,
1609			"I", "dropped_unicast_filtered");
1610
1611	/* verbose printing? */
1612	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1613		       "verbose",
1614		       CTLFLAG_RW, &mxge_verbose,
1615		       0, "verbose printing");
1616
1617	/* add counters exported for debugging from all slices */
1618	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1619	sc->slice_sysctl_tree =
1620		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1621				"slice", CTLFLAG_RD, 0, "");
1622
1623	for (slice = 0; slice < sc->num_slices; slice++) {
1624		ss = &sc->ss[slice];
1625		sysctl_ctx_init(&ss->sysctl_ctx);
1626		ctx = &ss->sysctl_ctx;
1627		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1628		sprintf(slice_num, "%d", slice);
1629		ss->sysctl_tree =
1630			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1631					CTLFLAG_RD, 0, "");
1632		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1633		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1634			       "rx_small_cnt",
1635			       CTLFLAG_RD, &ss->rx_small.cnt,
1636			       0, "rx_small_cnt");
1637		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1638			       "rx_big_cnt",
1639			       CTLFLAG_RD, &ss->rx_big.cnt,
1640			       0, "rx_small_cnt");
1641		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1642			       "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1643			       0, "number of lro merge queues flushed");
1644
1645		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1646			       "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1647			       0, "number of bad csums preventing LRO");
1648
1649		SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1650			       "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1651			       0, "number of frames appended to lro merge"
1652			       "queues");
1653
1654#ifndef IFNET_BUF_RING
1655		/* only transmit from slice 0 for now */
1656		if (slice > 0)
1657			continue;
1658#endif
1659		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1660			       "tx_req",
1661			       CTLFLAG_RD, &ss->tx.req,
1662			       0, "tx_req");
1663
1664		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1665			       "tx_done",
1666			       CTLFLAG_RD, &ss->tx.done,
1667			       0, "tx_done");
1668		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1669			       "tx_pkt_done",
1670			       CTLFLAG_RD, &ss->tx.pkt_done,
1671			       0, "tx_done");
1672		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1673			       "tx_stall",
1674			       CTLFLAG_RD, &ss->tx.stall,
1675			       0, "tx_stall");
1676		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677			       "tx_wake",
1678			       CTLFLAG_RD, &ss->tx.wake,
1679			       0, "tx_wake");
1680		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1681			       "tx_defrag",
1682			       CTLFLAG_RD, &ss->tx.defrag,
1683			       0, "tx_defrag");
1684		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685			       "tx_queue_active",
1686			       CTLFLAG_RD, &ss->tx.queue_active,
1687			       0, "tx_queue_active");
1688		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1689			       "tx_activate",
1690			       CTLFLAG_RD, &ss->tx.activate,
1691			       0, "tx_activate");
1692		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1693			       "tx_deactivate",
1694			       CTLFLAG_RD, &ss->tx.deactivate,
1695			       0, "tx_deactivate");
1696	}
1697}
1698
1699/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1700   backwards one at a time and handle ring wraps */
1701
1702static inline void
1703mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1704			    mcp_kreq_ether_send_t *src, int cnt)
1705{
1706	int idx, starting_slot;
1707	starting_slot = tx->req;
1708	while (cnt > 1) {
1709		cnt--;
1710		idx = (starting_slot + cnt) & tx->mask;
1711		mxge_pio_copy(&tx->lanai[idx],
1712			      &src[cnt], sizeof(*src));
1713		wmb();
1714	}
1715}
1716
1717/*
1718 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1719 * at most 32 bytes at a time, so as to avoid involving the software
1720 * pio handler in the nic.   We re-write the first segment's flags
1721 * to mark them valid only after writing the entire chain
1722 */
1723
1724static inline void
1725mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1726		  int cnt)
1727{
1728	int idx, i;
1729	uint32_t *src_ints;
1730	volatile uint32_t *dst_ints;
1731	mcp_kreq_ether_send_t *srcp;
1732	volatile mcp_kreq_ether_send_t *dstp, *dst;
1733	uint8_t last_flags;
1734
1735	idx = tx->req & tx->mask;
1736
1737	last_flags = src->flags;
1738	src->flags = 0;
1739	wmb();
1740	dst = dstp = &tx->lanai[idx];
1741	srcp = src;
1742
1743	if ((idx + cnt) < tx->mask) {
1744		for (i = 0; i < (cnt - 1); i += 2) {
1745			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1746			wmb(); /* force write every 32 bytes */
1747			srcp += 2;
1748			dstp += 2;
1749		}
1750	} else {
1751		/* submit all but the first request, and ensure
1752		   that it is submitted below */
1753		mxge_submit_req_backwards(tx, src, cnt);
1754		i = 0;
1755	}
1756	if (i < cnt) {
1757		/* submit the first request */
1758		mxge_pio_copy(dstp, srcp, sizeof(*src));
1759		wmb(); /* barrier before setting valid flag */
1760	}
1761
1762	/* re-write the last 32-bits with the valid flags */
1763	src->flags = last_flags;
1764	src_ints = (uint32_t *)src;
1765	src_ints+=3;
1766	dst_ints = (volatile uint32_t *)dst;
1767	dst_ints+=3;
1768	*dst_ints =  *src_ints;
1769	tx->req += cnt;
1770	wmb();
1771}
1772
1773static int
1774mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1775    struct mxge_pkt_info *pi)
1776{
1777	struct ether_vlan_header *eh;
1778	uint16_t etype;
1779	int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1780#if IFCAP_TSO6 && defined(INET6)
1781	int nxt;
1782#endif
1783
1784	eh = mtod(m, struct ether_vlan_header *);
1785	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1786		etype = ntohs(eh->evl_proto);
1787		pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1788	} else {
1789		etype = ntohs(eh->evl_encap_proto);
1790		pi->ip_off = ETHER_HDR_LEN;
1791	}
1792
1793	switch (etype) {
1794	case ETHERTYPE_IP:
1795		/*
1796		 * ensure ip header is in first mbuf, copy it to a
1797		 * scratch buffer if not
1798		 */
1799		pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1800		pi->ip6 = NULL;
1801		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1802			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1803			    ss->scratch);
1804			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1805		}
1806		pi->ip_hlen = pi->ip->ip_hl << 2;
1807		if (!tso)
1808			return 0;
1809
1810		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1811		    sizeof(struct tcphdr))) {
1812			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1813			    sizeof(struct tcphdr), ss->scratch);
1814			pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1815		}
1816		pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1817		break;
1818#if IFCAP_TSO6 && defined(INET6)
1819	case ETHERTYPE_IPV6:
1820		pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1821		if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1822			m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1823			    ss->scratch);
1824			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1825		}
1826		nxt = 0;
1827		pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1828		pi->ip_hlen -= pi->ip_off;
1829		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1830			return EINVAL;
1831
1832		if (!tso)
1833			return 0;
1834
1835		if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1836			return EINVAL;
1837
1838		if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1839		    sizeof(struct tcphdr))) {
1840			m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1841			    sizeof(struct tcphdr), ss->scratch);
1842			pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1843		}
1844		pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1845		break;
1846#endif
1847	default:
1848		return EINVAL;
1849	}
1850	return 0;
1851}
1852
1853#if IFCAP_TSO4
1854
1855static void
1856mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1857	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
1858{
1859	mxge_tx_ring_t *tx;
1860	mcp_kreq_ether_send_t *req;
1861	bus_dma_segment_t *seg;
1862	uint32_t low, high_swapped;
1863	int len, seglen, cum_len, cum_len_next;
1864	int next_is_first, chop, cnt, rdma_count, small;
1865	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1866	uint8_t flags, flags_next;
1867	static int once;
1868
1869	mss = m->m_pkthdr.tso_segsz;
1870
1871	/* negative cum_len signifies to the
1872	 * send loop that we are still in the
1873	 * header portion of the TSO packet.
1874	 */
1875
1876	cksum_offset = pi->ip_off + pi->ip_hlen;
1877	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1878
1879	/* TSO implies checksum offload on this hardware */
1880	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1881		/*
1882		 * If packet has full TCP csum, replace it with pseudo hdr
1883		 * sum that the NIC expects, otherwise the NIC will emit
1884		 * packets with bad TCP checksums.
1885		 */
1886		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1887		if (pi->ip6) {
1888#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1889			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1890			sum = in6_cksum_pseudo(pi->ip6,
1891			    m->m_pkthdr.len - cksum_offset,
1892			    IPPROTO_TCP, 0);
1893#endif
1894		} else {
1895#ifdef INET
1896			m->m_pkthdr.csum_flags |= CSUM_TCP;
1897			sum = in_pseudo(pi->ip->ip_src.s_addr,
1898			    pi->ip->ip_dst.s_addr,
1899			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
1900				    cksum_offset)));
1901#endif
1902		}
1903		m_copyback(m, offsetof(struct tcphdr, th_sum) +
1904		    cksum_offset, sizeof(sum), (caddr_t)&sum);
1905	}
1906	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1907
1908
1909	/* for TSO, pseudo_hdr_offset holds mss.
1910	 * The firmware figures out where to put
1911	 * the checksum by parsing the header. */
1912	pseudo_hdr_offset = htobe16(mss);
1913
1914	if (pi->ip6) {
1915		/*
1916		 * for IPv6 TSO, the "checksum offset" is re-purposed
1917		 * to store the TCP header len
1918		 */
1919		cksum_offset = (pi->tcp->th_off << 2);
1920	}
1921
1922	tx = &ss->tx;
1923	req = tx->req_list;
1924	seg = tx->seg_list;
1925	cnt = 0;
1926	rdma_count = 0;
1927	/* "rdma_count" is the number of RDMAs belonging to the
1928	 * current packet BEFORE the current send request. For
1929	 * non-TSO packets, this is equal to "count".
1930	 * For TSO packets, rdma_count needs to be reset
1931	 * to 0 after a segment cut.
1932	 *
1933	 * The rdma_count field of the send request is
1934	 * the number of RDMAs of the packet starting at
1935	 * that request. For TSO send requests with one ore more cuts
1936	 * in the middle, this is the number of RDMAs starting
1937	 * after the last cut in the request. All previous
1938	 * segments before the last cut implicitly have 1 RDMA.
1939	 *
1940	 * Since the number of RDMAs is not known beforehand,
1941	 * it must be filled-in retroactively - after each
1942	 * segmentation cut or at the end of the entire packet.
1943	 */
1944
1945	while (busdma_seg_cnt) {
1946		/* Break the busdma segment up into pieces*/
1947		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1948		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1949		len = seg->ds_len;
1950
1951		while (len) {
1952			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1953			seglen = len;
1954			cum_len_next = cum_len + seglen;
1955			(req-rdma_count)->rdma_count = rdma_count + 1;
1956			if (__predict_true(cum_len >= 0)) {
1957				/* payload */
1958				chop = (cum_len_next > mss);
1959				cum_len_next = cum_len_next % mss;
1960				next_is_first = (cum_len_next == 0);
1961				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1962				flags_next |= next_is_first *
1963					MXGEFW_FLAGS_FIRST;
1964				rdma_count |= -(chop | next_is_first);
1965				rdma_count += chop & !next_is_first;
1966			} else if (cum_len_next >= 0) {
1967				/* header ends */
1968				rdma_count = -1;
1969				cum_len_next = 0;
1970				seglen = -cum_len;
1971				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1972				flags_next = MXGEFW_FLAGS_TSO_PLD |
1973					MXGEFW_FLAGS_FIRST |
1974					(small * MXGEFW_FLAGS_SMALL);
1975			    }
1976
1977			req->addr_high = high_swapped;
1978			req->addr_low = htobe32(low);
1979			req->pseudo_hdr_offset = pseudo_hdr_offset;
1980			req->pad = 0;
1981			req->rdma_count = 1;
1982			req->length = htobe16(seglen);
1983			req->cksum_offset = cksum_offset;
1984			req->flags = flags | ((cum_len & 1) *
1985					      MXGEFW_FLAGS_ALIGN_ODD);
1986			low += seglen;
1987			len -= seglen;
1988			cum_len = cum_len_next;
1989			flags = flags_next;
1990			req++;
1991			cnt++;
1992			rdma_count++;
1993			if (cksum_offset != 0 && !pi->ip6) {
1994				if (__predict_false(cksum_offset > seglen))
1995					cksum_offset -= seglen;
1996				else
1997					cksum_offset = 0;
1998			}
1999			if (__predict_false(cnt > tx->max_desc))
2000				goto drop;
2001		}
2002		busdma_seg_cnt--;
2003		seg++;
2004	}
2005	(req-rdma_count)->rdma_count = rdma_count;
2006
2007	do {
2008		req--;
2009		req->flags |= MXGEFW_FLAGS_TSO_LAST;
2010	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2011
2012	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2013	mxge_submit_req(tx, tx->req_list, cnt);
2014#ifdef IFNET_BUF_RING
2015	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2016		/* tell the NIC to start polling this slice */
2017		*tx->send_go = 1;
2018		tx->queue_active = 1;
2019		tx->activate++;
2020		wmb();
2021	}
2022#endif
2023	return;
2024
2025drop:
2026	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2027	m_freem(m);
2028	ss->oerrors++;
2029	if (!once) {
2030		printf("tx->max_desc exceeded via TSO!\n");
2031		printf("mss = %d, %ld, %d!\n", mss,
2032		       (long)seg - (long)tx->seg_list, tx->max_desc);
2033		once = 1;
2034	}
2035	return;
2036
2037}
2038
2039#endif /* IFCAP_TSO4 */
2040
2041#ifdef MXGE_NEW_VLAN_API
2042/*
2043 * We reproduce the software vlan tag insertion from
2044 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2045 * vlan tag insertion. We need to advertise this in order to have the
2046 * vlan interface respect our csum offload flags.
2047 */
2048static struct mbuf *
2049mxge_vlan_tag_insert(struct mbuf *m)
2050{
2051	struct ether_vlan_header *evl;
2052
2053	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2054	if (__predict_false(m == NULL))
2055		return NULL;
2056	if (m->m_len < sizeof(*evl)) {
2057		m = m_pullup(m, sizeof(*evl));
2058		if (__predict_false(m == NULL))
2059			return NULL;
2060	}
2061	/*
2062	 * Transform the Ethernet header into an Ethernet header
2063	 * with 802.1Q encapsulation.
2064	 */
2065	evl = mtod(m, struct ether_vlan_header *);
2066	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2067	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2068	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2069	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2070	m->m_flags &= ~M_VLANTAG;
2071	return m;
2072}
2073#endif /* MXGE_NEW_VLAN_API */
2074
2075static void
2076mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2077{
2078	struct mxge_pkt_info pi = {0,0,0,0};
2079	mxge_softc_t *sc;
2080	mcp_kreq_ether_send_t *req;
2081	bus_dma_segment_t *seg;
2082	struct mbuf *m_tmp;
2083	struct ifnet *ifp;
2084	mxge_tx_ring_t *tx;
2085	int cnt, cum_len, err, i, idx, odd_flag;
2086	uint16_t pseudo_hdr_offset;
2087	uint8_t flags, cksum_offset;
2088
2089
2090	sc = ss->sc;
2091	ifp = sc->ifp;
2092	tx = &ss->tx;
2093
2094#ifdef MXGE_NEW_VLAN_API
2095	if (m->m_flags & M_VLANTAG) {
2096		m = mxge_vlan_tag_insert(m);
2097		if (__predict_false(m == NULL))
2098			goto drop_without_m;
2099	}
2100#endif
2101	if (m->m_pkthdr.csum_flags &
2102	    (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2103		if (mxge_parse_tx(ss, m, &pi))
2104			goto drop;
2105	}
2106
2107	/* (try to) map the frame for DMA */
2108	idx = tx->req & tx->mask;
2109	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2110				      m, tx->seg_list, &cnt,
2111				      BUS_DMA_NOWAIT);
2112	if (__predict_false(err == EFBIG)) {
2113		/* Too many segments in the chain.  Try
2114		   to defrag */
2115		m_tmp = m_defrag(m, M_NOWAIT);
2116		if (m_tmp == NULL) {
2117			goto drop;
2118		}
2119		ss->tx.defrag++;
2120		m = m_tmp;
2121		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2122					      tx->info[idx].map,
2123					      m, tx->seg_list, &cnt,
2124					      BUS_DMA_NOWAIT);
2125	}
2126	if (__predict_false(err != 0)) {
2127		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2128			      " packet len = %d\n", err, m->m_pkthdr.len);
2129		goto drop;
2130	}
2131	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2132			BUS_DMASYNC_PREWRITE);
2133	tx->info[idx].m = m;
2134
2135#if IFCAP_TSO4
2136	/* TSO is different enough, we handle it in another routine */
2137	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2138		mxge_encap_tso(ss, m, cnt, &pi);
2139		return;
2140	}
2141#endif
2142
2143	req = tx->req_list;
2144	cksum_offset = 0;
2145	pseudo_hdr_offset = 0;
2146	flags = MXGEFW_FLAGS_NO_TSO;
2147
2148	/* checksum offloading? */
2149	if (m->m_pkthdr.csum_flags &
2150	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2151		/* ensure ip header is in first mbuf, copy
2152		   it to a scratch buffer if not */
2153		cksum_offset = pi.ip_off + pi.ip_hlen;
2154		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2155		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2156		req->cksum_offset = cksum_offset;
2157		flags |= MXGEFW_FLAGS_CKSUM;
2158		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2159	} else {
2160		odd_flag = 0;
2161	}
2162	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2163		flags |= MXGEFW_FLAGS_SMALL;
2164
2165	/* convert segments into a request list */
2166	cum_len = 0;
2167	seg = tx->seg_list;
2168	req->flags = MXGEFW_FLAGS_FIRST;
2169	for (i = 0; i < cnt; i++) {
2170		req->addr_low =
2171			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2172		req->addr_high =
2173			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2174		req->length = htobe16(seg->ds_len);
2175		req->cksum_offset = cksum_offset;
2176		if (cksum_offset > seg->ds_len)
2177			cksum_offset -= seg->ds_len;
2178		else
2179			cksum_offset = 0;
2180		req->pseudo_hdr_offset = pseudo_hdr_offset;
2181		req->pad = 0; /* complete solid 16-byte block */
2182		req->rdma_count = 1;
2183		req->flags |= flags | ((cum_len & 1) * odd_flag);
2184		cum_len += seg->ds_len;
2185		seg++;
2186		req++;
2187		req->flags = 0;
2188	}
2189	req--;
2190	/* pad runts to 60 bytes */
2191	if (cum_len < 60) {
2192		req++;
2193		req->addr_low =
2194			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2195		req->addr_high =
2196			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2197		req->length = htobe16(60 - cum_len);
2198		req->cksum_offset = 0;
2199		req->pseudo_hdr_offset = pseudo_hdr_offset;
2200		req->pad = 0; /* complete solid 16-byte block */
2201		req->rdma_count = 1;
2202		req->flags |= flags | ((cum_len & 1) * odd_flag);
2203		cnt++;
2204	}
2205
2206	tx->req_list[0].rdma_count = cnt;
2207#if 0
2208	/* print what the firmware will see */
2209	for (i = 0; i < cnt; i++) {
2210		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2211		    "cso:%d, flags:0x%x, rdma:%d\n",
2212		    i, (int)ntohl(tx->req_list[i].addr_high),
2213		    (int)ntohl(tx->req_list[i].addr_low),
2214		    (int)ntohs(tx->req_list[i].length),
2215		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2216		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2217		    tx->req_list[i].rdma_count);
2218	}
2219	printf("--------------\n");
2220#endif
2221	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2222	mxge_submit_req(tx, tx->req_list, cnt);
2223#ifdef IFNET_BUF_RING
2224	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2225		/* tell the NIC to start polling this slice */
2226		*tx->send_go = 1;
2227		tx->queue_active = 1;
2228		tx->activate++;
2229		wmb();
2230	}
2231#endif
2232	return;
2233
2234drop:
2235	m_freem(m);
2236drop_without_m:
2237	ss->oerrors++;
2238	return;
2239}
2240
2241#ifdef IFNET_BUF_RING
2242static void
2243mxge_qflush(struct ifnet *ifp)
2244{
2245	mxge_softc_t *sc = ifp->if_softc;
2246	mxge_tx_ring_t *tx;
2247	struct mbuf *m;
2248	int slice;
2249
2250	for (slice = 0; slice < sc->num_slices; slice++) {
2251		tx = &sc->ss[slice].tx;
2252		mtx_lock(&tx->mtx);
2253		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2254			m_freem(m);
2255		mtx_unlock(&tx->mtx);
2256	}
2257	if_qflush(ifp);
2258}
2259
2260static inline void
2261mxge_start_locked(struct mxge_slice_state *ss)
2262{
2263	mxge_softc_t *sc;
2264	struct mbuf *m;
2265	struct ifnet *ifp;
2266	mxge_tx_ring_t *tx;
2267
2268	sc = ss->sc;
2269	ifp = sc->ifp;
2270	tx = &ss->tx;
2271
2272	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2273		m = drbr_dequeue(ifp, tx->br);
2274		if (m == NULL) {
2275			return;
2276		}
2277		/* let BPF see it */
2278		BPF_MTAP(ifp, m);
2279
2280		/* give it to the nic */
2281		mxge_encap(ss, m);
2282	}
2283	/* ran out of transmit slots */
2284	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2285	    && (!drbr_empty(ifp, tx->br))) {
2286		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2287		tx->stall++;
2288	}
2289}
2290
2291static int
2292mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2293{
2294	mxge_softc_t *sc;
2295	struct ifnet *ifp;
2296	mxge_tx_ring_t *tx;
2297	int err;
2298
2299	sc = ss->sc;
2300	ifp = sc->ifp;
2301	tx = &ss->tx;
2302
2303	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2304	    IFF_DRV_RUNNING) {
2305		err = drbr_enqueue(ifp, tx->br, m);
2306		return (err);
2307	}
2308
2309	if (!drbr_needs_enqueue(ifp, tx->br) &&
2310	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2311		/* let BPF see it */
2312		BPF_MTAP(ifp, m);
2313		/* give it to the nic */
2314		mxge_encap(ss, m);
2315	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2316		return (err);
2317	}
2318	if (!drbr_empty(ifp, tx->br))
2319		mxge_start_locked(ss);
2320	return (0);
2321}
2322
2323static int
2324mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2325{
2326	mxge_softc_t *sc = ifp->if_softc;
2327	struct mxge_slice_state *ss;
2328	mxge_tx_ring_t *tx;
2329	int err = 0;
2330	int slice;
2331
2332	slice = m->m_pkthdr.flowid;
2333	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2334
2335	ss = &sc->ss[slice];
2336	tx = &ss->tx;
2337
2338	if (mtx_trylock(&tx->mtx)) {
2339		err = mxge_transmit_locked(ss, m);
2340		mtx_unlock(&tx->mtx);
2341	} else {
2342		err = drbr_enqueue(ifp, tx->br, m);
2343	}
2344
2345	return (err);
2346}
2347
2348#else
2349
2350static inline void
2351mxge_start_locked(struct mxge_slice_state *ss)
2352{
2353	mxge_softc_t *sc;
2354	struct mbuf *m;
2355	struct ifnet *ifp;
2356	mxge_tx_ring_t *tx;
2357
2358	sc = ss->sc;
2359	ifp = sc->ifp;
2360	tx = &ss->tx;
2361	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2362		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2363		if (m == NULL) {
2364			return;
2365		}
2366		/* let BPF see it */
2367		BPF_MTAP(ifp, m);
2368
2369		/* give it to the nic */
2370		mxge_encap(ss, m);
2371	}
2372	/* ran out of transmit slots */
2373	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2374		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2375		tx->stall++;
2376	}
2377}
2378#endif
2379static void
2380mxge_start(struct ifnet *ifp)
2381{
2382	mxge_softc_t *sc = ifp->if_softc;
2383	struct mxge_slice_state *ss;
2384
2385	/* only use the first slice for now */
2386	ss = &sc->ss[0];
2387	mtx_lock(&ss->tx.mtx);
2388	mxge_start_locked(ss);
2389	mtx_unlock(&ss->tx.mtx);
2390}
2391
2392/*
2393 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2394 * at most 32 bytes at a time, so as to avoid involving the software
2395 * pio handler in the nic.   We re-write the first segment's low
2396 * DMA address to mark it valid only after we write the entire chunk
2397 * in a burst
2398 */
2399static inline void
2400mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2401		mcp_kreq_ether_recv_t *src)
2402{
2403	uint32_t low;
2404
2405	low = src->addr_low;
2406	src->addr_low = 0xffffffff;
2407	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2408	wmb();
2409	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2410	wmb();
2411	src->addr_low = low;
2412	dst->addr_low = low;
2413	wmb();
2414}
2415
2416static int
2417mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2418{
2419	bus_dma_segment_t seg;
2420	struct mbuf *m;
2421	mxge_rx_ring_t *rx = &ss->rx_small;
2422	int cnt, err;
2423
2424	m = m_gethdr(M_NOWAIT, MT_DATA);
2425	if (m == NULL) {
2426		rx->alloc_fail++;
2427		err = ENOBUFS;
2428		goto done;
2429	}
2430	m->m_len = MHLEN;
2431	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2432				      &seg, &cnt, BUS_DMA_NOWAIT);
2433	if (err != 0) {
2434		m_free(m);
2435		goto done;
2436	}
2437	rx->info[idx].m = m;
2438	rx->shadow[idx].addr_low =
2439		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2440	rx->shadow[idx].addr_high =
2441		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2442
2443done:
2444	if ((idx & 7) == 7)
2445		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2446	return err;
2447}
2448
2449static int
2450mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2451{
2452	bus_dma_segment_t seg[3];
2453	struct mbuf *m;
2454	mxge_rx_ring_t *rx = &ss->rx_big;
2455	int cnt, err, i;
2456
2457	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2458	if (m == NULL) {
2459		rx->alloc_fail++;
2460		err = ENOBUFS;
2461		goto done;
2462	}
2463	m->m_len = rx->mlen;
2464	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2465				      seg, &cnt, BUS_DMA_NOWAIT);
2466	if (err != 0) {
2467		m_free(m);
2468		goto done;
2469	}
2470	rx->info[idx].m = m;
2471	rx->shadow[idx].addr_low =
2472		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2473	rx->shadow[idx].addr_high =
2474		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2475
2476#if MXGE_VIRT_JUMBOS
2477	for (i = 1; i < cnt; i++) {
2478		rx->shadow[idx + i].addr_low =
2479			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2480		rx->shadow[idx + i].addr_high =
2481			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2482       }
2483#endif
2484
2485done:
2486       for (i = 0; i < rx->nbufs; i++) {
2487		if ((idx & 7) == 7) {
2488			mxge_submit_8rx(&rx->lanai[idx - 7],
2489					&rx->shadow[idx - 7]);
2490		}
2491		idx++;
2492	}
2493	return err;
2494}
2495
2496#ifdef INET6
2497
2498static uint16_t
2499mxge_csum_generic(uint16_t *raw, int len)
2500{
2501	uint32_t csum;
2502
2503
2504	csum = 0;
2505	while (len > 0) {
2506		csum += *raw;
2507		raw++;
2508		len -= 2;
2509	}
2510	csum = (csum >> 16) + (csum & 0xffff);
2511	csum = (csum >> 16) + (csum & 0xffff);
2512	return (uint16_t)csum;
2513}
2514
2515static inline uint16_t
2516mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2517{
2518	uint32_t partial;
2519	int nxt, cksum_offset;
2520	struct ip6_hdr *ip6 = p;
2521	uint16_t c;
2522
2523	nxt = ip6->ip6_nxt;
2524	cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2525	if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2526		cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2527					   IPPROTO_IPV6, &nxt);
2528		if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2529			return (1);
2530	}
2531
2532	/*
2533	 * IPv6 headers do not contain a checksum, and hence
2534	 * do not checksum to zero, so they don't "fall out"
2535	 * of the partial checksum calculation like IPv4
2536	 * headers do.  We need to fix the partial checksum by
2537	 * subtracting the checksum of the IPv6 header.
2538	 */
2539
2540	partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2541				    ETHER_HDR_LEN);
2542	csum += ~partial;
2543	csum +=	 (csum < ~partial);
2544	csum = (csum >> 16) + (csum & 0xFFFF);
2545	csum = (csum >> 16) + (csum & 0xFFFF);
2546	c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2547			     csum);
2548	c ^= 0xffff;
2549	return (c);
2550}
2551#endif /* INET6 */
2552/*
2553 *  Myri10GE hardware checksums are not valid if the sender
2554 *  padded the frame with non-zero padding.  This is because
2555 *  the firmware just does a simple 16-bit 1s complement
2556 *  checksum across the entire frame, excluding the first 14
2557 *  bytes.  It is best to simply to check the checksum and
2558 *  tell the stack about it only if the checksum is good
2559 */
2560
2561static inline uint16_t
2562mxge_rx_csum(struct mbuf *m, int csum)
2563{
2564	struct ether_header *eh;
2565#ifdef INET
2566	struct ip *ip;
2567#endif
2568#if defined(INET) || defined(INET6)
2569	int cap = m->m_pkthdr.rcvif->if_capenable;
2570#endif
2571	uint16_t c, etype;
2572
2573
2574	eh = mtod(m, struct ether_header *);
2575	etype = ntohs(eh->ether_type);
2576	switch (etype) {
2577#ifdef INET
2578	case ETHERTYPE_IP:
2579		if ((cap & IFCAP_RXCSUM) == 0)
2580			return (1);
2581		ip = (struct ip *)(eh + 1);
2582		if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2583			return (1);
2584		c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2585			      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2586				    (ip->ip_hl << 2) + ip->ip_p));
2587		c ^= 0xffff;
2588		break;
2589#endif
2590#ifdef INET6
2591	case ETHERTYPE_IPV6:
2592		if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2593			return (1);
2594		c = mxge_rx_csum6((eh + 1), m, csum);
2595		break;
2596#endif
2597	default:
2598		c = 1;
2599	}
2600	return (c);
2601}
2602
2603static void
2604mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2605{
2606	struct ether_vlan_header *evl;
2607	struct ether_header *eh;
2608	uint32_t partial;
2609
2610	evl = mtod(m, struct ether_vlan_header *);
2611	eh = mtod(m, struct ether_header *);
2612
2613	/*
2614	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2615	 * after what the firmware thought was the end of the ethernet
2616	 * header.
2617	 */
2618
2619	/* put checksum into host byte order */
2620	*csum = ntohs(*csum);
2621	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2622	(*csum) += ~partial;
2623	(*csum) +=  ((*csum) < ~partial);
2624	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2626
2627	/* restore checksum to network byte order;
2628	   later consumers expect this */
2629	*csum = htons(*csum);
2630
2631	/* save the tag */
2632#ifdef MXGE_NEW_VLAN_API
2633	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2634#else
2635	{
2636		struct m_tag *mtag;
2637		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2638				   M_NOWAIT);
2639		if (mtag == NULL)
2640			return;
2641		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2642		m_tag_prepend(m, mtag);
2643	}
2644
2645#endif
2646	m->m_flags |= M_VLANTAG;
2647
2648	/*
2649	 * Remove the 802.1q header by copying the Ethernet
2650	 * addresses over it and adjusting the beginning of
2651	 * the data in the mbuf.  The encapsulated Ethernet
2652	 * type field is already in place.
2653	 */
2654	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2655	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2656	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2657}
2658
2659
2660static inline void
2661mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2662		 uint32_t csum, int lro)
2663{
2664	mxge_softc_t *sc;
2665	struct ifnet *ifp;
2666	struct mbuf *m;
2667	struct ether_header *eh;
2668	mxge_rx_ring_t *rx;
2669	bus_dmamap_t old_map;
2670	int idx;
2671
2672	sc = ss->sc;
2673	ifp = sc->ifp;
2674	rx = &ss->rx_big;
2675	idx = rx->cnt & rx->mask;
2676	rx->cnt += rx->nbufs;
2677	/* save a pointer to the received mbuf */
2678	m = rx->info[idx].m;
2679	/* try to replace the received mbuf */
2680	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2681		/* drop the frame -- the old mbuf is re-cycled */
2682		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2683		return;
2684	}
2685
2686	/* unmap the received buffer */
2687	old_map = rx->info[idx].map;
2688	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2689	bus_dmamap_unload(rx->dmat, old_map);
2690
2691	/* swap the bus_dmamap_t's */
2692	rx->info[idx].map = rx->extra_map;
2693	rx->extra_map = old_map;
2694
2695	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2696	 * aligned */
2697	m->m_data += MXGEFW_PAD;
2698
2699	m->m_pkthdr.rcvif = ifp;
2700	m->m_len = m->m_pkthdr.len = len;
2701	ss->ipackets++;
2702	eh = mtod(m, struct ether_header *);
2703	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2704		mxge_vlan_tag_remove(m, &csum);
2705	}
2706	/* flowid only valid if RSS hashing is enabled */
2707	if (sc->num_slices > 1) {
2708		m->m_pkthdr.flowid = (ss - sc->ss);
2709		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2710	}
2711	/* if the checksum is valid, mark it in the mbuf header */
2712	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2713	    (0 == mxge_rx_csum(m, csum))) {
2714		/* Tell the stack that the  checksum is good */
2715		m->m_pkthdr.csum_data = 0xffff;
2716		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2717			CSUM_DATA_VALID;
2718
2719#if defined(INET) || defined (INET6)
2720		if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2721			return;
2722#endif
2723	}
2724	/* pass the frame up the stack */
2725	(*ifp->if_input)(ifp, m);
2726}
2727
2728static inline void
2729mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2730		   uint32_t csum, int lro)
2731{
2732	mxge_softc_t *sc;
2733	struct ifnet *ifp;
2734	struct ether_header *eh;
2735	struct mbuf *m;
2736	mxge_rx_ring_t *rx;
2737	bus_dmamap_t old_map;
2738	int idx;
2739
2740	sc = ss->sc;
2741	ifp = sc->ifp;
2742	rx = &ss->rx_small;
2743	idx = rx->cnt & rx->mask;
2744	rx->cnt++;
2745	/* save a pointer to the received mbuf */
2746	m = rx->info[idx].m;
2747	/* try to replace the received mbuf */
2748	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2749		/* drop the frame -- the old mbuf is re-cycled */
2750		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2751		return;
2752	}
2753
2754	/* unmap the received buffer */
2755	old_map = rx->info[idx].map;
2756	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2757	bus_dmamap_unload(rx->dmat, old_map);
2758
2759	/* swap the bus_dmamap_t's */
2760	rx->info[idx].map = rx->extra_map;
2761	rx->extra_map = old_map;
2762
2763	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2764	 * aligned */
2765	m->m_data += MXGEFW_PAD;
2766
2767	m->m_pkthdr.rcvif = ifp;
2768	m->m_len = m->m_pkthdr.len = len;
2769	ss->ipackets++;
2770	eh = mtod(m, struct ether_header *);
2771	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2772		mxge_vlan_tag_remove(m, &csum);
2773	}
2774	/* flowid only valid if RSS hashing is enabled */
2775	if (sc->num_slices > 1) {
2776		m->m_pkthdr.flowid = (ss - sc->ss);
2777		M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2778	}
2779	/* if the checksum is valid, mark it in the mbuf header */
2780	if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2781	    (0 == mxge_rx_csum(m, csum))) {
2782		/* Tell the stack that the  checksum is good */
2783		m->m_pkthdr.csum_data = 0xffff;
2784		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2785			CSUM_DATA_VALID;
2786
2787#if defined(INET) || defined (INET6)
2788		if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2789			return;
2790#endif
2791	}
2792	/* pass the frame up the stack */
2793	(*ifp->if_input)(ifp, m);
2794}
2795
2796static inline void
2797mxge_clean_rx_done(struct mxge_slice_state *ss)
2798{
2799	mxge_rx_done_t *rx_done = &ss->rx_done;
2800	int limit = 0;
2801	uint16_t length;
2802	uint16_t checksum;
2803	int lro;
2804
2805	lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806	while (rx_done->entry[rx_done->idx].length != 0) {
2807		length = ntohs(rx_done->entry[rx_done->idx].length);
2808		rx_done->entry[rx_done->idx].length = 0;
2809		checksum = rx_done->entry[rx_done->idx].checksum;
2810		if (length <= (MHLEN - MXGEFW_PAD))
2811			mxge_rx_done_small(ss, length, checksum, lro);
2812		else
2813			mxge_rx_done_big(ss, length, checksum, lro);
2814		rx_done->cnt++;
2815		rx_done->idx = rx_done->cnt & rx_done->mask;
2816
2817		/* limit potential for livelock */
2818		if (__predict_false(++limit > rx_done->mask / 2))
2819			break;
2820	}
2821#if defined(INET)  || defined (INET6)
2822	tcp_lro_flush_all(&ss->lc);
2823#endif
2824}
2825
2826
2827static inline void
2828mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2829{
2830	struct ifnet *ifp;
2831	mxge_tx_ring_t *tx;
2832	struct mbuf *m;
2833	bus_dmamap_t map;
2834	int idx;
2835	int *flags;
2836
2837	tx = &ss->tx;
2838	ifp = ss->sc->ifp;
2839	while (tx->pkt_done != mcp_idx) {
2840		idx = tx->done & tx->mask;
2841		tx->done++;
2842		m = tx->info[idx].m;
2843		/* mbuf and DMA map only attached to the first
2844		   segment per-mbuf */
2845		if (m != NULL) {
2846			ss->obytes += m->m_pkthdr.len;
2847			if (m->m_flags & M_MCAST)
2848				ss->omcasts++;
2849			ss->opackets++;
2850			tx->info[idx].m = NULL;
2851			map = tx->info[idx].map;
2852			bus_dmamap_unload(tx->dmat, map);
2853			m_freem(m);
2854		}
2855		if (tx->info[idx].flag) {
2856			tx->info[idx].flag = 0;
2857			tx->pkt_done++;
2858		}
2859	}
2860
2861	/* If we have space, clear IFF_OACTIVE to tell the stack that
2862	   its OK to send packets */
2863#ifdef IFNET_BUF_RING
2864	flags = &ss->if_drv_flags;
2865#else
2866	flags = &ifp->if_drv_flags;
2867#endif
2868	mtx_lock(&ss->tx.mtx);
2869	if ((*flags) & IFF_DRV_OACTIVE &&
2870	    tx->req - tx->done < (tx->mask + 1)/4) {
2871		*(flags) &= ~IFF_DRV_OACTIVE;
2872		ss->tx.wake++;
2873		mxge_start_locked(ss);
2874	}
2875#ifdef IFNET_BUF_RING
2876	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2877		/* let the NIC stop polling this queue, since there
2878		 * are no more transmits pending */
2879		if (tx->req == tx->done) {
2880			*tx->send_stop = 1;
2881			tx->queue_active = 0;
2882			tx->deactivate++;
2883			wmb();
2884		}
2885	}
2886#endif
2887	mtx_unlock(&ss->tx.mtx);
2888
2889}
2890
2891static struct mxge_media_type mxge_xfp_media_types[] =
2892{
2893	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2894	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2895	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2896	{0,		(1 << 5),	"10GBASE-ER"},
2897	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2898	{0,		(1 << 3),	"10GBASE-SW"},
2899	{0,		(1 << 2),	"10GBASE-LW"},
2900	{0,		(1 << 1),	"10GBASE-EW"},
2901	{0,		(1 << 0),	"Reserved"}
2902};
2903static struct mxge_media_type mxge_sfp_media_types[] =
2904{
2905	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2906	{0,		(1 << 7),	"Reserved"},
2907	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2908	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2909	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2910	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2911};
2912
2913static void
2914mxge_media_set(mxge_softc_t *sc, int media_type)
2915{
2916
2917
2918	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2919		    0, NULL);
2920	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2921	sc->current_media = media_type;
2922	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2923}
2924
2925static void
2926mxge_media_init(mxge_softc_t *sc)
2927{
2928	char *ptr;
2929	int i;
2930
2931	ifmedia_removeall(&sc->media);
2932	mxge_media_set(sc, IFM_AUTO);
2933
2934	/*
2935	 * parse the product code to deterimine the interface type
2936	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2937	 * after the 3rd dash in the driver's cached copy of the
2938	 * EEPROM's product code string.
2939	 */
2940	ptr = sc->product_code_string;
2941	if (ptr == NULL) {
2942		device_printf(sc->dev, "Missing product code\n");
2943		return;
2944	}
2945
2946	for (i = 0; i < 3; i++, ptr++) {
2947		ptr = strchr(ptr, '-');
2948		if (ptr == NULL) {
2949			device_printf(sc->dev,
2950				      "only %d dashes in PC?!?\n", i);
2951			return;
2952		}
2953	}
2954	if (*ptr == 'C' || *(ptr +1) == 'C') {
2955		/* -C is CX4 */
2956		sc->connector = MXGE_CX4;
2957		mxge_media_set(sc, IFM_10G_CX4);
2958	} else if (*ptr == 'Q') {
2959		/* -Q is Quad Ribbon Fiber */
2960		sc->connector = MXGE_QRF;
2961		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2962		/* FreeBSD has no media type for Quad ribbon fiber */
2963	} else if (*ptr == 'R') {
2964		/* -R is XFP */
2965		sc->connector = MXGE_XFP;
2966	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2967		/* -S or -2S is SFP+ */
2968		sc->connector = MXGE_SFP;
2969	} else {
2970		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2971	}
2972}
2973
2974/*
2975 * Determine the media type for a NIC.  Some XFPs will identify
2976 * themselves only when their link is up, so this is initiated via a
2977 * link up interrupt.  However, this can potentially take up to
2978 * several milliseconds, so it is run via the watchdog routine, rather
2979 * than in the interrupt handler itself.
2980 */
2981static void
2982mxge_media_probe(mxge_softc_t *sc)
2983{
2984	mxge_cmd_t cmd;
2985	char *cage_type;
2986
2987	struct mxge_media_type *mxge_media_types = NULL;
2988	int i, err, ms, mxge_media_type_entries;
2989	uint32_t byte;
2990
2991	sc->need_media_probe = 0;
2992
2993	if (sc->connector == MXGE_XFP) {
2994		/* -R is XFP */
2995		mxge_media_types = mxge_xfp_media_types;
2996		mxge_media_type_entries =
2997			nitems(mxge_xfp_media_types);
2998		byte = MXGE_XFP_COMPLIANCE_BYTE;
2999		cage_type = "XFP";
3000	} else 	if (sc->connector == MXGE_SFP) {
3001		/* -S or -2S is SFP+ */
3002		mxge_media_types = mxge_sfp_media_types;
3003		mxge_media_type_entries =
3004			nitems(mxge_sfp_media_types);
3005		cage_type = "SFP+";
3006		byte = 3;
3007	} else {
3008		/* nothing to do; media type cannot change */
3009		return;
3010	}
3011
3012	/*
3013	 * At this point we know the NIC has an XFP cage, so now we
3014	 * try to determine what is in the cage by using the
3015	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
3016	 * register.  We read just one byte, which may take over
3017	 * a millisecond
3018	 */
3019
3020	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
3021	cmd.data1 = byte;
3022	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3023	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3024		device_printf(sc->dev, "failed to read XFP\n");
3025	}
3026	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3027		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3028	}
3029	if (err != MXGEFW_CMD_OK) {
3030		return;
3031	}
3032
3033	/* now we wait for the data to be cached */
3034	cmd.data0 = byte;
3035	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3036	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3037		DELAY(1000);
3038		cmd.data0 = byte;
3039		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3040	}
3041	if (err != MXGEFW_CMD_OK) {
3042		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3043			      cage_type, err, ms);
3044		return;
3045	}
3046
3047	if (cmd.data0 == mxge_media_types[0].bitmask) {
3048		if (mxge_verbose)
3049			device_printf(sc->dev, "%s:%s\n", cage_type,
3050				      mxge_media_types[0].name);
3051		if (sc->current_media != mxge_media_types[0].flag) {
3052			mxge_media_init(sc);
3053			mxge_media_set(sc, mxge_media_types[0].flag);
3054		}
3055		return;
3056	}
3057	for (i = 1; i < mxge_media_type_entries; i++) {
3058		if (cmd.data0 & mxge_media_types[i].bitmask) {
3059			if (mxge_verbose)
3060				device_printf(sc->dev, "%s:%s\n",
3061					      cage_type,
3062					      mxge_media_types[i].name);
3063
3064			if (sc->current_media != mxge_media_types[i].flag) {
3065				mxge_media_init(sc);
3066				mxge_media_set(sc, mxge_media_types[i].flag);
3067			}
3068			return;
3069		}
3070	}
3071	if (mxge_verbose)
3072		device_printf(sc->dev, "%s media 0x%x unknown\n",
3073			      cage_type, cmd.data0);
3074
3075	return;
3076}
3077
3078static void
3079mxge_intr(void *arg)
3080{
3081	struct mxge_slice_state *ss = arg;
3082	mxge_softc_t *sc = ss->sc;
3083	mcp_irq_data_t *stats = ss->fw_stats;
3084	mxge_tx_ring_t *tx = &ss->tx;
3085	mxge_rx_done_t *rx_done = &ss->rx_done;
3086	uint32_t send_done_count;
3087	uint8_t valid;
3088
3089
3090#ifndef IFNET_BUF_RING
3091	/* an interrupt on a non-zero slice is implicitly valid
3092	   since MSI-X irqs are not shared */
3093	if (ss != sc->ss) {
3094		mxge_clean_rx_done(ss);
3095		*ss->irq_claim = be32toh(3);
3096		return;
3097	}
3098#endif
3099
3100	/* make sure the DMA has finished */
3101	if (!stats->valid) {
3102		return;
3103	}
3104	valid = stats->valid;
3105
3106	if (sc->legacy_irq) {
3107		/* lower legacy IRQ  */
3108		*sc->irq_deassert = 0;
3109		if (!mxge_deassert_wait)
3110			/* don't wait for conf. that irq is low */
3111			stats->valid = 0;
3112	} else {
3113		stats->valid = 0;
3114	}
3115
3116	/* loop while waiting for legacy irq deassertion */
3117	do {
3118		/* check for transmit completes and receives */
3119		send_done_count = be32toh(stats->send_done_count);
3120		while ((send_done_count != tx->pkt_done) ||
3121		       (rx_done->entry[rx_done->idx].length != 0)) {
3122			if (send_done_count != tx->pkt_done)
3123				mxge_tx_done(ss, (int)send_done_count);
3124			mxge_clean_rx_done(ss);
3125			send_done_count = be32toh(stats->send_done_count);
3126		}
3127		if (sc->legacy_irq && mxge_deassert_wait)
3128			wmb();
3129	} while (*((volatile uint8_t *) &stats->valid));
3130
3131	/* fw link & error stats meaningful only on the first slice */
3132	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3133		if (sc->link_state != stats->link_up) {
3134			sc->link_state = stats->link_up;
3135			if (sc->link_state) {
3136				if_link_state_change(sc->ifp, LINK_STATE_UP);
3137				if (mxge_verbose)
3138					device_printf(sc->dev, "link up\n");
3139			} else {
3140				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3141				if (mxge_verbose)
3142					device_printf(sc->dev, "link down\n");
3143			}
3144			sc->need_media_probe = 1;
3145		}
3146		if (sc->rdma_tags_available !=
3147		    be32toh(stats->rdma_tags_available)) {
3148			sc->rdma_tags_available =
3149				be32toh(stats->rdma_tags_available);
3150			device_printf(sc->dev, "RDMA timed out! %d tags "
3151				      "left\n", sc->rdma_tags_available);
3152		}
3153
3154		if (stats->link_down) {
3155			sc->down_cnt += stats->link_down;
3156			sc->link_state = 0;
3157			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3158		}
3159	}
3160
3161	/* check to see if we have rx token to pass back */
3162	if (valid & 0x1)
3163	    *ss->irq_claim = be32toh(3);
3164	*(ss->irq_claim + 1) = be32toh(3);
3165}
3166
3167static void
3168mxge_init(void *arg)
3169{
3170	mxge_softc_t *sc = arg;
3171	struct ifnet *ifp = sc->ifp;
3172
3173
3174	mtx_lock(&sc->driver_mtx);
3175	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3176		(void) mxge_open(sc);
3177	mtx_unlock(&sc->driver_mtx);
3178}
3179
3180
3181
3182static void
3183mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3184{
3185	int i;
3186
3187#if defined(INET) || defined(INET6)
3188	tcp_lro_free(&ss->lc);
3189#endif
3190	for (i = 0; i <= ss->rx_big.mask; i++) {
3191		if (ss->rx_big.info[i].m == NULL)
3192			continue;
3193		bus_dmamap_unload(ss->rx_big.dmat,
3194				  ss->rx_big.info[i].map);
3195		m_freem(ss->rx_big.info[i].m);
3196		ss->rx_big.info[i].m = NULL;
3197	}
3198
3199	for (i = 0; i <= ss->rx_small.mask; i++) {
3200		if (ss->rx_small.info[i].m == NULL)
3201			continue;
3202		bus_dmamap_unload(ss->rx_small.dmat,
3203				  ss->rx_small.info[i].map);
3204		m_freem(ss->rx_small.info[i].m);
3205		ss->rx_small.info[i].m = NULL;
3206	}
3207
3208	/* transmit ring used only on the first slice */
3209	if (ss->tx.info == NULL)
3210		return;
3211
3212	for (i = 0; i <= ss->tx.mask; i++) {
3213		ss->tx.info[i].flag = 0;
3214		if (ss->tx.info[i].m == NULL)
3215			continue;
3216		bus_dmamap_unload(ss->tx.dmat,
3217				  ss->tx.info[i].map);
3218		m_freem(ss->tx.info[i].m);
3219		ss->tx.info[i].m = NULL;
3220	}
3221}
3222
3223static void
3224mxge_free_mbufs(mxge_softc_t *sc)
3225{
3226	int slice;
3227
3228	for (slice = 0; slice < sc->num_slices; slice++)
3229		mxge_free_slice_mbufs(&sc->ss[slice]);
3230}
3231
3232static void
3233mxge_free_slice_rings(struct mxge_slice_state *ss)
3234{
3235	int i;
3236
3237
3238	if (ss->rx_done.entry != NULL)
3239		mxge_dma_free(&ss->rx_done.dma);
3240	ss->rx_done.entry = NULL;
3241
3242	if (ss->tx.req_bytes != NULL)
3243		free(ss->tx.req_bytes, M_DEVBUF);
3244	ss->tx.req_bytes = NULL;
3245
3246	if (ss->tx.seg_list != NULL)
3247		free(ss->tx.seg_list, M_DEVBUF);
3248	ss->tx.seg_list = NULL;
3249
3250	if (ss->rx_small.shadow != NULL)
3251		free(ss->rx_small.shadow, M_DEVBUF);
3252	ss->rx_small.shadow = NULL;
3253
3254	if (ss->rx_big.shadow != NULL)
3255		free(ss->rx_big.shadow, M_DEVBUF);
3256	ss->rx_big.shadow = NULL;
3257
3258	if (ss->tx.info != NULL) {
3259		if (ss->tx.dmat != NULL) {
3260			for (i = 0; i <= ss->tx.mask; i++) {
3261				bus_dmamap_destroy(ss->tx.dmat,
3262						   ss->tx.info[i].map);
3263			}
3264			bus_dma_tag_destroy(ss->tx.dmat);
3265		}
3266		free(ss->tx.info, M_DEVBUF);
3267	}
3268	ss->tx.info = NULL;
3269
3270	if (ss->rx_small.info != NULL) {
3271		if (ss->rx_small.dmat != NULL) {
3272			for (i = 0; i <= ss->rx_small.mask; i++) {
3273				bus_dmamap_destroy(ss->rx_small.dmat,
3274						   ss->rx_small.info[i].map);
3275			}
3276			bus_dmamap_destroy(ss->rx_small.dmat,
3277					   ss->rx_small.extra_map);
3278			bus_dma_tag_destroy(ss->rx_small.dmat);
3279		}
3280		free(ss->rx_small.info, M_DEVBUF);
3281	}
3282	ss->rx_small.info = NULL;
3283
3284	if (ss->rx_big.info != NULL) {
3285		if (ss->rx_big.dmat != NULL) {
3286			for (i = 0; i <= ss->rx_big.mask; i++) {
3287				bus_dmamap_destroy(ss->rx_big.dmat,
3288						   ss->rx_big.info[i].map);
3289			}
3290			bus_dmamap_destroy(ss->rx_big.dmat,
3291					   ss->rx_big.extra_map);
3292			bus_dma_tag_destroy(ss->rx_big.dmat);
3293		}
3294		free(ss->rx_big.info, M_DEVBUF);
3295	}
3296	ss->rx_big.info = NULL;
3297}
3298
3299static void
3300mxge_free_rings(mxge_softc_t *sc)
3301{
3302	int slice;
3303
3304	for (slice = 0; slice < sc->num_slices; slice++)
3305		mxge_free_slice_rings(&sc->ss[slice]);
3306}
3307
3308static int
3309mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3310		       int tx_ring_entries)
3311{
3312	mxge_softc_t *sc = ss->sc;
3313	size_t bytes;
3314	int err, i;
3315
3316	/* allocate per-slice receive resources */
3317
3318	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3319	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3320
3321	/* allocate the rx shadow rings */
3322	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3323	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3324
3325	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3326	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3327
3328	/* allocate the rx host info rings */
3329	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3330	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3331
3332	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3333	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3334
3335	/* allocate the rx busdma resources */
3336	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3337				 1,			/* alignment */
3338				 4096,			/* boundary */
3339				 BUS_SPACE_MAXADDR,	/* low */
3340				 BUS_SPACE_MAXADDR,	/* high */
3341				 NULL, NULL,		/* filter */
3342				 MHLEN,			/* maxsize */
3343				 1,			/* num segs */
3344				 MHLEN,			/* maxsegsize */
3345				 BUS_DMA_ALLOCNOW,	/* flags */
3346				 NULL, NULL,		/* lock */
3347				 &ss->rx_small.dmat);	/* tag */
3348	if (err != 0) {
3349		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3350			      err);
3351		return err;
3352	}
3353
3354	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3355				 1,			/* alignment */
3356#if MXGE_VIRT_JUMBOS
3357				 4096,			/* boundary */
3358#else
3359				 0,			/* boundary */
3360#endif
3361				 BUS_SPACE_MAXADDR,	/* low */
3362				 BUS_SPACE_MAXADDR,	/* high */
3363				 NULL, NULL,		/* filter */
3364				 3*4096,		/* maxsize */
3365#if MXGE_VIRT_JUMBOS
3366				 3,			/* num segs */
3367				 4096,			/* maxsegsize*/
3368#else
3369				 1,			/* num segs */
3370				 MJUM9BYTES,		/* maxsegsize*/
3371#endif
3372				 BUS_DMA_ALLOCNOW,	/* flags */
3373				 NULL, NULL,		/* lock */
3374				 &ss->rx_big.dmat);	/* tag */
3375	if (err != 0) {
3376		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3377			      err);
3378		return err;
3379	}
3380	for (i = 0; i <= ss->rx_small.mask; i++) {
3381		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3382					&ss->rx_small.info[i].map);
3383		if (err != 0) {
3384			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3385				      err);
3386			return err;
3387		}
3388	}
3389	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3390				&ss->rx_small.extra_map);
3391	if (err != 0) {
3392		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3393			      err);
3394		return err;
3395	}
3396
3397	for (i = 0; i <= ss->rx_big.mask; i++) {
3398		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3399					&ss->rx_big.info[i].map);
3400		if (err != 0) {
3401			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3402				      err);
3403			return err;
3404		}
3405	}
3406	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3407				&ss->rx_big.extra_map);
3408	if (err != 0) {
3409		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3410			      err);
3411		return err;
3412	}
3413
3414	/* now allocate TX resources */
3415
3416#ifndef IFNET_BUF_RING
3417	/* only use a single TX ring for now */
3418	if (ss != ss->sc->ss)
3419		return 0;
3420#endif
3421
3422	ss->tx.mask = tx_ring_entries - 1;
3423	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3424
3425
3426	/* allocate the tx request copy block */
3427	bytes = 8 +
3428		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3429	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3430	/* ensure req_list entries are aligned to 8 bytes */
3431	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3432		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3433
3434	/* allocate the tx busdma segment list */
3435	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3436	ss->tx.seg_list = (bus_dma_segment_t *)
3437		malloc(bytes, M_DEVBUF, M_WAITOK);
3438
3439	/* allocate the tx host info ring */
3440	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3441	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3442
3443	/* allocate the tx busdma resources */
3444	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3445				 1,			/* alignment */
3446				 sc->tx_boundary,	/* boundary */
3447				 BUS_SPACE_MAXADDR,	/* low */
3448				 BUS_SPACE_MAXADDR,	/* high */
3449				 NULL, NULL,		/* filter */
3450				 65536 + 256,		/* maxsize */
3451				 ss->tx.max_desc - 2,	/* num segs */
3452				 sc->tx_boundary,	/* maxsegsz */
3453				 BUS_DMA_ALLOCNOW,	/* flags */
3454				 NULL, NULL,		/* lock */
3455				 &ss->tx.dmat);		/* tag */
3456
3457	if (err != 0) {
3458		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3459			      err);
3460		return err;
3461	}
3462
3463	/* now use these tags to setup dmamaps for each slot
3464	   in the ring */
3465	for (i = 0; i <= ss->tx.mask; i++) {
3466		err = bus_dmamap_create(ss->tx.dmat, 0,
3467					&ss->tx.info[i].map);
3468		if (err != 0) {
3469			device_printf(sc->dev, "Err %d  tx dmamap\n",
3470				      err);
3471			return err;
3472		}
3473	}
3474	return 0;
3475
3476}
3477
3478static int
3479mxge_alloc_rings(mxge_softc_t *sc)
3480{
3481	mxge_cmd_t cmd;
3482	int tx_ring_size;
3483	int tx_ring_entries, rx_ring_entries;
3484	int err, slice;
3485
3486	/* get ring sizes */
3487	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3488	tx_ring_size = cmd.data0;
3489	if (err != 0) {
3490		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3491		goto abort;
3492	}
3493
3494	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3495	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3496	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3497	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3498	IFQ_SET_READY(&sc->ifp->if_snd);
3499
3500	for (slice = 0; slice < sc->num_slices; slice++) {
3501		err = mxge_alloc_slice_rings(&sc->ss[slice],
3502					     rx_ring_entries,
3503					     tx_ring_entries);
3504		if (err != 0)
3505			goto abort;
3506	}
3507	return 0;
3508
3509abort:
3510	mxge_free_rings(sc);
3511	return err;
3512
3513}
3514
3515
3516static void
3517mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3518{
3519	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3520
3521	if (bufsize < MCLBYTES) {
3522		/* easy, everything fits in a single buffer */
3523		*big_buf_size = MCLBYTES;
3524		*cl_size = MCLBYTES;
3525		*nbufs = 1;
3526		return;
3527	}
3528
3529	if (bufsize < MJUMPAGESIZE) {
3530		/* still easy, everything still fits in a single buffer */
3531		*big_buf_size = MJUMPAGESIZE;
3532		*cl_size = MJUMPAGESIZE;
3533		*nbufs = 1;
3534		return;
3535	}
3536#if MXGE_VIRT_JUMBOS
3537	/* now we need to use virtually contiguous buffers */
3538	*cl_size = MJUM9BYTES;
3539	*big_buf_size = 4096;
3540	*nbufs = mtu / 4096 + 1;
3541	/* needs to be a power of two, so round up */
3542	if (*nbufs == 3)
3543		*nbufs = 4;
3544#else
3545	*cl_size = MJUM9BYTES;
3546	*big_buf_size = MJUM9BYTES;
3547	*nbufs = 1;
3548#endif
3549}
3550
3551static int
3552mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3553{
3554	mxge_softc_t *sc;
3555	mxge_cmd_t cmd;
3556	bus_dmamap_t map;
3557	int err, i, slice;
3558
3559
3560	sc = ss->sc;
3561	slice = ss - sc->ss;
3562
3563#if defined(INET) || defined(INET6)
3564	(void)tcp_lro_init(&ss->lc);
3565#endif
3566	ss->lc.ifp = sc->ifp;
3567
3568	/* get the lanai pointers to the send and receive rings */
3569
3570	err = 0;
3571#ifndef IFNET_BUF_RING
3572	/* We currently only send from the first slice */
3573	if (slice == 0) {
3574#endif
3575		cmd.data0 = slice;
3576		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3577		ss->tx.lanai =
3578			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3579		ss->tx.send_go = (volatile uint32_t *)
3580			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3581		ss->tx.send_stop = (volatile uint32_t *)
3582		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3583#ifndef IFNET_BUF_RING
3584	}
3585#endif
3586	cmd.data0 = slice;
3587	err |= mxge_send_cmd(sc,
3588			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3589	ss->rx_small.lanai =
3590		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3591	cmd.data0 = slice;
3592	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3593	ss->rx_big.lanai =
3594		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3595
3596	if (err != 0) {
3597		device_printf(sc->dev,
3598			      "failed to get ring sizes or locations\n");
3599		return EIO;
3600	}
3601
3602	/* stock receive rings */
3603	for (i = 0; i <= ss->rx_small.mask; i++) {
3604		map = ss->rx_small.info[i].map;
3605		err = mxge_get_buf_small(ss, map, i);
3606		if (err) {
3607			device_printf(sc->dev, "alloced %d/%d smalls\n",
3608				      i, ss->rx_small.mask + 1);
3609			return ENOMEM;
3610		}
3611	}
3612	for (i = 0; i <= ss->rx_big.mask; i++) {
3613		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3614		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3615	}
3616	ss->rx_big.nbufs = nbufs;
3617	ss->rx_big.cl_size = cl_size;
3618	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3619		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3620	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3621		map = ss->rx_big.info[i].map;
3622		err = mxge_get_buf_big(ss, map, i);
3623		if (err) {
3624			device_printf(sc->dev, "alloced %d/%d bigs\n",
3625				      i, ss->rx_big.mask + 1);
3626			return ENOMEM;
3627		}
3628	}
3629	return 0;
3630}
3631
3632static int
3633mxge_open(mxge_softc_t *sc)
3634{
3635	mxge_cmd_t cmd;
3636	int err, big_bytes, nbufs, slice, cl_size, i;
3637	bus_addr_t bus;
3638	volatile uint8_t *itable;
3639	struct mxge_slice_state *ss;
3640
3641	/* Copy the MAC address in case it was overridden */
3642	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3643
3644	err = mxge_reset(sc, 1);
3645	if (err != 0) {
3646		device_printf(sc->dev, "failed to reset\n");
3647		return EIO;
3648	}
3649
3650	if (sc->num_slices > 1) {
3651		/* setup the indirection table */
3652		cmd.data0 = sc->num_slices;
3653		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3654				    &cmd);
3655
3656		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3657				     &cmd);
3658		if (err != 0) {
3659			device_printf(sc->dev,
3660				      "failed to setup rss tables\n");
3661			return err;
3662		}
3663
3664		/* just enable an identity mapping */
3665		itable = sc->sram + cmd.data0;
3666		for (i = 0; i < sc->num_slices; i++)
3667			itable[i] = (uint8_t)i;
3668
3669		cmd.data0 = 1;
3670		cmd.data1 = mxge_rss_hash_type;
3671		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3672		if (err != 0) {
3673			device_printf(sc->dev, "failed to enable slices\n");
3674			return err;
3675		}
3676	}
3677
3678
3679	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3680
3681	cmd.data0 = nbufs;
3682	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3683			    &cmd);
3684	/* error is only meaningful if we're trying to set
3685	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3686	if (err && nbufs > 1) {
3687		device_printf(sc->dev,
3688			      "Failed to set alway-use-n to %d\n",
3689			      nbufs);
3690		return EIO;
3691	}
3692	/* Give the firmware the mtu and the big and small buffer
3693	   sizes.  The firmware wants the big buf size to be a power
3694	   of two. Luckily, FreeBSD's clusters are powers of two */
3695	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3696	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3697	cmd.data0 = MHLEN - MXGEFW_PAD;
3698	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3699			     &cmd);
3700	cmd.data0 = big_bytes;
3701	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3702
3703	if (err != 0) {
3704		device_printf(sc->dev, "failed to setup params\n");
3705		goto abort;
3706	}
3707
3708	/* Now give him the pointer to the stats block */
3709	for (slice = 0;
3710#ifdef IFNET_BUF_RING
3711	     slice < sc->num_slices;
3712#else
3713	     slice < 1;
3714#endif
3715	     slice++) {
3716		ss = &sc->ss[slice];
3717		cmd.data0 =
3718			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3719		cmd.data1 =
3720			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3721		cmd.data2 = sizeof(struct mcp_irq_data);
3722		cmd.data2 |= (slice << 16);
3723		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3724	}
3725
3726	if (err != 0) {
3727		bus = sc->ss->fw_stats_dma.bus_addr;
3728		bus += offsetof(struct mcp_irq_data, send_done_count);
3729		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3730		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3731		err = mxge_send_cmd(sc,
3732				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3733				    &cmd);
3734		/* Firmware cannot support multicast without STATS_DMA_V2 */
3735		sc->fw_multicast_support = 0;
3736	} else {
3737		sc->fw_multicast_support = 1;
3738	}
3739
3740	if (err != 0) {
3741		device_printf(sc->dev, "failed to setup params\n");
3742		goto abort;
3743	}
3744
3745	for (slice = 0; slice < sc->num_slices; slice++) {
3746		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3747		if (err != 0) {
3748			device_printf(sc->dev, "couldn't open slice %d\n",
3749				      slice);
3750			goto abort;
3751		}
3752	}
3753
3754	/* Finally, start the firmware running */
3755	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3756	if (err) {
3757		device_printf(sc->dev, "Couldn't bring up link\n");
3758		goto abort;
3759	}
3760#ifdef IFNET_BUF_RING
3761	for (slice = 0; slice < sc->num_slices; slice++) {
3762		ss = &sc->ss[slice];
3763		ss->if_drv_flags |= IFF_DRV_RUNNING;
3764		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3765	}
3766#endif
3767	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3768	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3769
3770	return 0;
3771
3772
3773abort:
3774	mxge_free_mbufs(sc);
3775
3776	return err;
3777}
3778
3779static int
3780mxge_close(mxge_softc_t *sc, int down)
3781{
3782	mxge_cmd_t cmd;
3783	int err, old_down_cnt;
3784#ifdef IFNET_BUF_RING
3785	struct mxge_slice_state *ss;
3786	int slice;
3787#endif
3788
3789#ifdef IFNET_BUF_RING
3790	for (slice = 0; slice < sc->num_slices; slice++) {
3791		ss = &sc->ss[slice];
3792		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3793	}
3794#endif
3795	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3796	if (!down) {
3797		old_down_cnt = sc->down_cnt;
3798		wmb();
3799		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3800		if (err) {
3801			device_printf(sc->dev,
3802				      "Couldn't bring down link\n");
3803		}
3804		if (old_down_cnt == sc->down_cnt) {
3805			/* wait for down irq */
3806			DELAY(10 * sc->intr_coal_delay);
3807		}
3808		wmb();
3809		if (old_down_cnt == sc->down_cnt) {
3810			device_printf(sc->dev, "never got down irq\n");
3811		}
3812	}
3813	mxge_free_mbufs(sc);
3814
3815	return 0;
3816}
3817
3818static void
3819mxge_setup_cfg_space(mxge_softc_t *sc)
3820{
3821	device_t dev = sc->dev;
3822	int reg;
3823	uint16_t lnk, pectl;
3824
3825	/* find the PCIe link width and set max read request to 4KB*/
3826	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3827		lnk = pci_read_config(dev, reg + 0x12, 2);
3828		sc->link_width = (lnk >> 4) & 0x3f;
3829
3830		if (sc->pectl == 0) {
3831			pectl = pci_read_config(dev, reg + 0x8, 2);
3832			pectl = (pectl & ~0x7000) | (5 << 12);
3833			pci_write_config(dev, reg + 0x8, pectl, 2);
3834			sc->pectl = pectl;
3835		} else {
3836			/* restore saved pectl after watchdog reset */
3837			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3838		}
3839	}
3840
3841	/* Enable DMA and Memory space access */
3842	pci_enable_busmaster(dev);
3843}
3844
3845static uint32_t
3846mxge_read_reboot(mxge_softc_t *sc)
3847{
3848	device_t dev = sc->dev;
3849	uint32_t vs;
3850
3851	/* find the vendor specific offset */
3852	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3853		device_printf(sc->dev,
3854			      "could not find vendor specific offset\n");
3855		return (uint32_t)-1;
3856	}
3857	/* enable read32 mode */
3858	pci_write_config(dev, vs + 0x10, 0x3, 1);
3859	/* tell NIC which register to read */
3860	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3861	return (pci_read_config(dev, vs + 0x14, 4));
3862}
3863
3864static void
3865mxge_watchdog_reset(mxge_softc_t *sc)
3866{
3867	struct pci_devinfo *dinfo;
3868	struct mxge_slice_state *ss;
3869	int err, running, s, num_tx_slices = 1;
3870	uint32_t reboot;
3871	uint16_t cmd;
3872
3873	err = ENXIO;
3874
3875	device_printf(sc->dev, "Watchdog reset!\n");
3876
3877	/*
3878	 * check to see if the NIC rebooted.  If it did, then all of
3879	 * PCI config space has been reset, and things like the
3880	 * busmaster bit will be zero.  If this is the case, then we
3881	 * must restore PCI config space before the NIC can be used
3882	 * again
3883	 */
3884	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3885	if (cmd == 0xffff) {
3886		/*
3887		 * maybe the watchdog caught the NIC rebooting; wait
3888		 * up to 100ms for it to finish.  If it does not come
3889		 * back, then give up
3890		 */
3891		DELAY(1000*100);
3892		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3893		if (cmd == 0xffff) {
3894			device_printf(sc->dev, "NIC disappeared!\n");
3895		}
3896	}
3897	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3898		/* print the reboot status */
3899		reboot = mxge_read_reboot(sc);
3900		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3901			      reboot);
3902		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3903		if (running) {
3904
3905			/*
3906			 * quiesce NIC so that TX routines will not try to
3907			 * xmit after restoration of BAR
3908			 */
3909
3910			/* Mark the link as down */
3911			if (sc->link_state) {
3912				sc->link_state = 0;
3913				if_link_state_change(sc->ifp,
3914						     LINK_STATE_DOWN);
3915			}
3916#ifdef IFNET_BUF_RING
3917			num_tx_slices = sc->num_slices;
3918#endif
3919			/* grab all TX locks to ensure no tx  */
3920			for (s = 0; s < num_tx_slices; s++) {
3921				ss = &sc->ss[s];
3922				mtx_lock(&ss->tx.mtx);
3923			}
3924			mxge_close(sc, 1);
3925		}
3926		/* restore PCI configuration space */
3927		dinfo = device_get_ivars(sc->dev);
3928		pci_cfg_restore(sc->dev, dinfo);
3929
3930		/* and redo any changes we made to our config space */
3931		mxge_setup_cfg_space(sc);
3932
3933		/* reload f/w */
3934		err = mxge_load_firmware(sc, 0);
3935		if (err) {
3936			device_printf(sc->dev,
3937				      "Unable to re-load f/w\n");
3938		}
3939		if (running) {
3940			if (!err)
3941				err = mxge_open(sc);
3942			/* release all TX locks */
3943			for (s = 0; s < num_tx_slices; s++) {
3944				ss = &sc->ss[s];
3945#ifdef IFNET_BUF_RING
3946				mxge_start_locked(ss);
3947#endif
3948				mtx_unlock(&ss->tx.mtx);
3949			}
3950		}
3951		sc->watchdog_resets++;
3952	} else {
3953		device_printf(sc->dev,
3954			      "NIC did not reboot, not resetting\n");
3955		err = 0;
3956	}
3957	if (err) {
3958		device_printf(sc->dev, "watchdog reset failed\n");
3959	} else {
3960		if (sc->dying == 2)
3961			sc->dying = 0;
3962		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3963	}
3964}
3965
3966static void
3967mxge_watchdog_task(void *arg, int pending)
3968{
3969	mxge_softc_t *sc = arg;
3970
3971
3972	mtx_lock(&sc->driver_mtx);
3973	mxge_watchdog_reset(sc);
3974	mtx_unlock(&sc->driver_mtx);
3975}
3976
3977static void
3978mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3979{
3980	tx = &sc->ss[slice].tx;
3981	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3982	device_printf(sc->dev,
3983		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3984		      tx->req, tx->done, tx->queue_active);
3985	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3986			      tx->activate, tx->deactivate);
3987	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3988		      tx->pkt_done,
3989		      be32toh(sc->ss->fw_stats->send_done_count));
3990}
3991
3992static int
3993mxge_watchdog(mxge_softc_t *sc)
3994{
3995	mxge_tx_ring_t *tx;
3996	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3997	int i, err = 0;
3998
3999	/* see if we have outstanding transmits, which
4000	   have been pending for more than mxge_ticks */
4001	for (i = 0;
4002#ifdef IFNET_BUF_RING
4003	     (i < sc->num_slices) && (err == 0);
4004#else
4005	     (i < 1) && (err == 0);
4006#endif
4007	     i++) {
4008		tx = &sc->ss[i].tx;
4009		if (tx->req != tx->done &&
4010		    tx->watchdog_req != tx->watchdog_done &&
4011		    tx->done == tx->watchdog_done) {
4012			/* check for pause blocking before resetting */
4013			if (tx->watchdog_rx_pause == rx_pause) {
4014				mxge_warn_stuck(sc, tx, i);
4015				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4016				return (ENXIO);
4017			}
4018			else
4019				device_printf(sc->dev, "Flow control blocking "
4020					      "xmits, check link partner\n");
4021		}
4022
4023		tx->watchdog_req = tx->req;
4024		tx->watchdog_done = tx->done;
4025		tx->watchdog_rx_pause = rx_pause;
4026	}
4027
4028	if (sc->need_media_probe)
4029		mxge_media_probe(sc);
4030	return (err);
4031}
4032
4033static uint64_t
4034mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4035{
4036	struct mxge_softc *sc;
4037	uint64_t rv;
4038
4039	sc = if_getsoftc(ifp);
4040	rv = 0;
4041
4042	switch (cnt) {
4043	case IFCOUNTER_IPACKETS:
4044		for (int s = 0; s < sc->num_slices; s++)
4045			rv += sc->ss[s].ipackets;
4046		return (rv);
4047	case IFCOUNTER_OPACKETS:
4048		for (int s = 0; s < sc->num_slices; s++)
4049			rv += sc->ss[s].opackets;
4050		return (rv);
4051	case IFCOUNTER_OERRORS:
4052		for (int s = 0; s < sc->num_slices; s++)
4053			rv += sc->ss[s].oerrors;
4054		return (rv);
4055#ifdef IFNET_BUF_RING
4056	case IFCOUNTER_OBYTES:
4057		for (int s = 0; s < sc->num_slices; s++)
4058			rv += sc->ss[s].obytes;
4059		return (rv);
4060	case IFCOUNTER_OMCASTS:
4061		for (int s = 0; s < sc->num_slices; s++)
4062			rv += sc->ss[s].omcasts;
4063		return (rv);
4064	case IFCOUNTER_OQDROPS:
4065		for (int s = 0; s < sc->num_slices; s++)
4066			rv += sc->ss[s].tx.br->br_drops;
4067		return (rv);
4068#endif
4069	default:
4070		return (if_get_counter_default(ifp, cnt));
4071	}
4072}
4073
4074static void
4075mxge_tick(void *arg)
4076{
4077	mxge_softc_t *sc = arg;
4078	u_long pkts = 0;
4079	int err = 0;
4080	int running, ticks;
4081	uint16_t cmd;
4082
4083	ticks = mxge_ticks;
4084	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4085	if (running) {
4086		if (!sc->watchdog_countdown) {
4087			err = mxge_watchdog(sc);
4088			sc->watchdog_countdown = 4;
4089		}
4090		sc->watchdog_countdown--;
4091	}
4092	if (pkts == 0) {
4093		/* ensure NIC did not suffer h/w fault while idle */
4094		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4095		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4096			sc->dying = 2;
4097			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4098			err = ENXIO;
4099		}
4100		/* look less often if NIC is idle */
4101		ticks *= 4;
4102	}
4103
4104	if (err == 0)
4105		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4106
4107}
4108
4109static int
4110mxge_media_change(struct ifnet *ifp)
4111{
4112	return EINVAL;
4113}
4114
4115static int
4116mxge_change_mtu(mxge_softc_t *sc, int mtu)
4117{
4118	struct ifnet *ifp = sc->ifp;
4119	int real_mtu, old_mtu;
4120	int err = 0;
4121
4122
4123	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4124	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4125		return EINVAL;
4126	mtx_lock(&sc->driver_mtx);
4127	old_mtu = ifp->if_mtu;
4128	ifp->if_mtu = mtu;
4129	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4130		mxge_close(sc, 0);
4131		err = mxge_open(sc);
4132		if (err != 0) {
4133			ifp->if_mtu = old_mtu;
4134			mxge_close(sc, 0);
4135			(void) mxge_open(sc);
4136		}
4137	}
4138	mtx_unlock(&sc->driver_mtx);
4139	return err;
4140}
4141
4142static void
4143mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4144{
4145	mxge_softc_t *sc = ifp->if_softc;
4146
4147
4148	if (sc == NULL)
4149		return;
4150	ifmr->ifm_status = IFM_AVALID;
4151	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4152	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4153	ifmr->ifm_active |= sc->current_media;
4154}
4155
4156static int
4157mxge_fetch_i2c(mxge_softc_t *sc, struct ifi2creq *i2c)
4158{
4159	mxge_cmd_t cmd;
4160	uint32_t i2c_args;
4161	int i, ms, err;
4162
4163
4164	if (i2c->dev_addr != 0xA0 &&
4165	    i2c->dev_addr != 0xA2)
4166		return (EINVAL);
4167	if (i2c->len > sizeof(i2c->data))
4168		return (EINVAL);
4169
4170	for (i = 0; i < i2c->len; i++) {
4171		i2c_args = i2c->dev_addr << 0x8;
4172		i2c_args |= i2c->offset + i;
4173		cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
4174		cmd.data1 = i2c_args;
4175		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
4176
4177		if (err != MXGEFW_CMD_OK)
4178			return (EIO);
4179		/* now we wait for the data to be cached */
4180		cmd.data0 = i2c_args & 0xff;
4181		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4182		for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
4183			cmd.data0 = i2c_args & 0xff;
4184			err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
4185			if (err == EBUSY)
4186				DELAY(1000);
4187		}
4188		if (err != MXGEFW_CMD_OK)
4189			return (EIO);
4190		i2c->data[i] = cmd.data0;
4191	}
4192	return (0);
4193}
4194
4195static int
4196mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4197{
4198	mxge_softc_t *sc = ifp->if_softc;
4199	struct ifreq *ifr = (struct ifreq *)data;
4200	struct ifi2creq i2c;
4201	int err, mask;
4202
4203	err = 0;
4204	switch (command) {
4205	case SIOCSIFMTU:
4206		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4207		break;
4208
4209	case SIOCSIFFLAGS:
4210		mtx_lock(&sc->driver_mtx);
4211		if (sc->dying) {
4212			mtx_unlock(&sc->driver_mtx);
4213			return EINVAL;
4214		}
4215		if (ifp->if_flags & IFF_UP) {
4216			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4217				err = mxge_open(sc);
4218			} else {
4219				/* take care of promis can allmulti
4220				   flag chages */
4221				mxge_change_promisc(sc,
4222						    ifp->if_flags & IFF_PROMISC);
4223				mxge_set_multicast_list(sc);
4224			}
4225		} else {
4226			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4227				mxge_close(sc, 0);
4228			}
4229		}
4230		mtx_unlock(&sc->driver_mtx);
4231		break;
4232
4233	case SIOCADDMULTI:
4234	case SIOCDELMULTI:
4235		mtx_lock(&sc->driver_mtx);
4236		if (sc->dying) {
4237			mtx_unlock(&sc->driver_mtx);
4238			return (EINVAL);
4239		}
4240		mxge_set_multicast_list(sc);
4241		mtx_unlock(&sc->driver_mtx);
4242		break;
4243
4244	case SIOCSIFCAP:
4245		mtx_lock(&sc->driver_mtx);
4246		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4247		if (mask & IFCAP_TXCSUM) {
4248			if (IFCAP_TXCSUM & ifp->if_capenable) {
4249				mask &= ~IFCAP_TSO4;
4250				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4251				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4252			} else {
4253				ifp->if_capenable |= IFCAP_TXCSUM;
4254				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4255			}
4256		}
4257		if (mask & IFCAP_RXCSUM) {
4258			if (IFCAP_RXCSUM & ifp->if_capenable) {
4259				ifp->if_capenable &= ~IFCAP_RXCSUM;
4260			} else {
4261				ifp->if_capenable |= IFCAP_RXCSUM;
4262			}
4263		}
4264		if (mask & IFCAP_TSO4) {
4265			if (IFCAP_TSO4 & ifp->if_capenable) {
4266				ifp->if_capenable &= ~IFCAP_TSO4;
4267			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4268				ifp->if_capenable |= IFCAP_TSO4;
4269				ifp->if_hwassist |= CSUM_TSO;
4270			} else {
4271				printf("mxge requires tx checksum offload"
4272				       " be enabled to use TSO\n");
4273				err = EINVAL;
4274			}
4275		}
4276#if IFCAP_TSO6
4277		if (mask & IFCAP_TXCSUM_IPV6) {
4278			if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4279				mask &= ~IFCAP_TSO6;
4280				ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4281						       | IFCAP_TSO6);
4282				ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4283						      | CSUM_UDP);
4284			} else {
4285				ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4286				ifp->if_hwassist |= (CSUM_TCP_IPV6
4287						     | CSUM_UDP_IPV6);
4288			}
4289		}
4290		if (mask & IFCAP_RXCSUM_IPV6) {
4291			if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4292				ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4293			} else {
4294				ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4295			}
4296		}
4297		if (mask & IFCAP_TSO6) {
4298			if (IFCAP_TSO6 & ifp->if_capenable) {
4299				ifp->if_capenable &= ~IFCAP_TSO6;
4300			} else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4301				ifp->if_capenable |= IFCAP_TSO6;
4302				ifp->if_hwassist |= CSUM_TSO;
4303			} else {
4304				printf("mxge requires tx checksum offload"
4305				       " be enabled to use TSO\n");
4306				err = EINVAL;
4307			}
4308		}
4309#endif /*IFCAP_TSO6 */
4310
4311		if (mask & IFCAP_LRO)
4312			ifp->if_capenable ^= IFCAP_LRO;
4313		if (mask & IFCAP_VLAN_HWTAGGING)
4314			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4315		if (mask & IFCAP_VLAN_HWTSO)
4316			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4317
4318		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4319		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4320			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4321
4322		mtx_unlock(&sc->driver_mtx);
4323		VLAN_CAPABILITIES(ifp);
4324
4325		break;
4326
4327	case SIOCGIFMEDIA:
4328		mtx_lock(&sc->driver_mtx);
4329		if (sc->dying) {
4330			mtx_unlock(&sc->driver_mtx);
4331			return (EINVAL);
4332		}
4333		mxge_media_probe(sc);
4334		mtx_unlock(&sc->driver_mtx);
4335		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4336				    &sc->media, command);
4337		break;
4338
4339	case SIOCGI2C:
4340		if (sc->connector != MXGE_XFP &&
4341		    sc->connector != MXGE_SFP) {
4342			err = ENXIO;
4343			break;
4344		}
4345		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4346		if (err != 0)
4347			break;
4348		mtx_lock(&sc->driver_mtx);
4349		if (sc->dying) {
4350			mtx_unlock(&sc->driver_mtx);
4351			return (EINVAL);
4352		}
4353		err = mxge_fetch_i2c(sc, &i2c);
4354		mtx_unlock(&sc->driver_mtx);
4355		if (err == 0)
4356			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4357			    sizeof(i2c));
4358		break;
4359	default:
4360		err = ether_ioctl(ifp, command, data);
4361		break;
4362	}
4363	return err;
4364}
4365
4366static void
4367mxge_fetch_tunables(mxge_softc_t *sc)
4368{
4369
4370	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4371	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4372			  &mxge_flow_control);
4373	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4374			  &mxge_intr_coal_delay);
4375	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4376			  &mxge_nvidia_ecrc_enable);
4377	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4378			  &mxge_force_firmware);
4379	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4380			  &mxge_deassert_wait);
4381	TUNABLE_INT_FETCH("hw.mxge.verbose",
4382			  &mxge_verbose);
4383	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4384	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4385	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4386	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4387	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4388	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4389
4390	if (bootverbose)
4391		mxge_verbose = 1;
4392	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4393		mxge_intr_coal_delay = 30;
4394	if (mxge_ticks == 0)
4395		mxge_ticks = hz / 2;
4396	sc->pause = mxge_flow_control;
4397	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4398	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4399		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4400	}
4401	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4402	    mxge_initial_mtu < ETHER_MIN_LEN)
4403		mxge_initial_mtu = ETHERMTU_JUMBO;
4404
4405	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4406		mxge_throttle = MXGE_MAX_THROTTLE;
4407	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4408		mxge_throttle = MXGE_MIN_THROTTLE;
4409	sc->throttle = mxge_throttle;
4410}
4411
4412
4413static void
4414mxge_free_slices(mxge_softc_t *sc)
4415{
4416	struct mxge_slice_state *ss;
4417	int i;
4418
4419
4420	if (sc->ss == NULL)
4421		return;
4422
4423	for (i = 0; i < sc->num_slices; i++) {
4424		ss = &sc->ss[i];
4425		if (ss->fw_stats != NULL) {
4426			mxge_dma_free(&ss->fw_stats_dma);
4427			ss->fw_stats = NULL;
4428#ifdef IFNET_BUF_RING
4429			if (ss->tx.br != NULL) {
4430				drbr_free(ss->tx.br, M_DEVBUF);
4431				ss->tx.br = NULL;
4432			}
4433#endif
4434			mtx_destroy(&ss->tx.mtx);
4435		}
4436		if (ss->rx_done.entry != NULL) {
4437			mxge_dma_free(&ss->rx_done.dma);
4438			ss->rx_done.entry = NULL;
4439		}
4440	}
4441	free(sc->ss, M_DEVBUF);
4442	sc->ss = NULL;
4443}
4444
4445static int
4446mxge_alloc_slices(mxge_softc_t *sc)
4447{
4448	mxge_cmd_t cmd;
4449	struct mxge_slice_state *ss;
4450	size_t bytes;
4451	int err, i, max_intr_slots;
4452
4453	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4454	if (err != 0) {
4455		device_printf(sc->dev, "Cannot determine rx ring size\n");
4456		return err;
4457	}
4458	sc->rx_ring_size = cmd.data0;
4459	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4460
4461	bytes = sizeof (*sc->ss) * sc->num_slices;
4462	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4463	if (sc->ss == NULL)
4464		return (ENOMEM);
4465	for (i = 0; i < sc->num_slices; i++) {
4466		ss = &sc->ss[i];
4467
4468		ss->sc = sc;
4469
4470		/* allocate per-slice rx interrupt queues */
4471
4472		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4473		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4474		if (err != 0)
4475			goto abort;
4476		ss->rx_done.entry = ss->rx_done.dma.addr;
4477		bzero(ss->rx_done.entry, bytes);
4478
4479		/*
4480		 * allocate the per-slice firmware stats; stats
4481		 * (including tx) are used used only on the first
4482		 * slice for now
4483		 */
4484#ifndef IFNET_BUF_RING
4485		if (i > 0)
4486			continue;
4487#endif
4488
4489		bytes = sizeof (*ss->fw_stats);
4490		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4491				     sizeof (*ss->fw_stats), 64);
4492		if (err != 0)
4493			goto abort;
4494		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4495		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4496			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4497		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4498#ifdef IFNET_BUF_RING
4499		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4500					   &ss->tx.mtx);
4501#endif
4502	}
4503
4504	return (0);
4505
4506abort:
4507	mxge_free_slices(sc);
4508	return (ENOMEM);
4509}
4510
4511static void
4512mxge_slice_probe(mxge_softc_t *sc)
4513{
4514	mxge_cmd_t cmd;
4515	char *old_fw;
4516	int msix_cnt, status, max_intr_slots;
4517
4518	sc->num_slices = 1;
4519	/*
4520	 *  don't enable multiple slices if they are not enabled,
4521	 *  or if this is not an SMP system
4522	 */
4523
4524	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4525		return;
4526
4527	/* see how many MSI-X interrupts are available */
4528	msix_cnt = pci_msix_count(sc->dev);
4529	if (msix_cnt < 2)
4530		return;
4531
4532	/* now load the slice aware firmware see what it supports */
4533	old_fw = sc->fw_name;
4534	if (old_fw == mxge_fw_aligned)
4535		sc->fw_name = mxge_fw_rss_aligned;
4536	else
4537		sc->fw_name = mxge_fw_rss_unaligned;
4538	status = mxge_load_firmware(sc, 0);
4539	if (status != 0) {
4540		device_printf(sc->dev, "Falling back to a single slice\n");
4541		return;
4542	}
4543
4544	/* try to send a reset command to the card to see if it
4545	   is alive */
4546	memset(&cmd, 0, sizeof (cmd));
4547	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4548	if (status != 0) {
4549		device_printf(sc->dev, "failed reset\n");
4550		goto abort_with_fw;
4551	}
4552
4553	/* get rx ring size */
4554	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4555	if (status != 0) {
4556		device_printf(sc->dev, "Cannot determine rx ring size\n");
4557		goto abort_with_fw;
4558	}
4559	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4560
4561	/* tell it the size of the interrupt queues */
4562	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4563	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4564	if (status != 0) {
4565		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4566		goto abort_with_fw;
4567	}
4568
4569	/* ask the maximum number of slices it supports */
4570	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4571	if (status != 0) {
4572		device_printf(sc->dev,
4573			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4574		goto abort_with_fw;
4575	}
4576	sc->num_slices = cmd.data0;
4577	if (sc->num_slices > msix_cnt)
4578		sc->num_slices = msix_cnt;
4579
4580	if (mxge_max_slices == -1) {
4581		/* cap to number of CPUs in system */
4582		if (sc->num_slices > mp_ncpus)
4583			sc->num_slices = mp_ncpus;
4584	} else {
4585		if (sc->num_slices > mxge_max_slices)
4586			sc->num_slices = mxge_max_slices;
4587	}
4588	/* make sure it is a power of two */
4589	while (sc->num_slices & (sc->num_slices - 1))
4590		sc->num_slices--;
4591
4592	if (mxge_verbose)
4593		device_printf(sc->dev, "using %d slices\n",
4594			      sc->num_slices);
4595
4596	return;
4597
4598abort_with_fw:
4599	sc->fw_name = old_fw;
4600	(void) mxge_load_firmware(sc, 0);
4601}
4602
4603static int
4604mxge_add_msix_irqs(mxge_softc_t *sc)
4605{
4606	size_t bytes;
4607	int count, err, i, rid;
4608
4609	rid = PCIR_BAR(2);
4610	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4611						    &rid, RF_ACTIVE);
4612
4613	if (sc->msix_table_res == NULL) {
4614		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4615		return ENXIO;
4616	}
4617
4618	count = sc->num_slices;
4619	err = pci_alloc_msix(sc->dev, &count);
4620	if (err != 0) {
4621		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4622			      "err = %d \n", sc->num_slices, err);
4623		goto abort_with_msix_table;
4624	}
4625	if (count < sc->num_slices) {
4626		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4627			      count, sc->num_slices);
4628		device_printf(sc->dev,
4629			      "Try setting hw.mxge.max_slices to %d\n",
4630			      count);
4631		err = ENOSPC;
4632		goto abort_with_msix;
4633	}
4634	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4635	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4636	if (sc->msix_irq_res == NULL) {
4637		err = ENOMEM;
4638		goto abort_with_msix;
4639	}
4640
4641	for (i = 0; i < sc->num_slices; i++) {
4642		rid = i + 1;
4643		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4644							  SYS_RES_IRQ,
4645							  &rid, RF_ACTIVE);
4646		if (sc->msix_irq_res[i] == NULL) {
4647			device_printf(sc->dev, "couldn't allocate IRQ res"
4648				      " for message %d\n", i);
4649			err = ENXIO;
4650			goto abort_with_res;
4651		}
4652	}
4653
4654	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4655	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4656
4657	for (i = 0; i < sc->num_slices; i++) {
4658		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4659				     INTR_TYPE_NET | INTR_MPSAFE,
4660#if __FreeBSD_version > 700030
4661				     NULL,
4662#endif
4663				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4664		if (err != 0) {
4665			device_printf(sc->dev, "couldn't setup intr for "
4666				      "message %d\n", i);
4667			goto abort_with_intr;
4668		}
4669		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4670				  sc->msix_ih[i], "s%d", i);
4671	}
4672
4673	if (mxge_verbose) {
4674		device_printf(sc->dev, "using %d msix IRQs:",
4675			      sc->num_slices);
4676		for (i = 0; i < sc->num_slices; i++)
4677			printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4678		printf("\n");
4679	}
4680	return (0);
4681
4682abort_with_intr:
4683	for (i = 0; i < sc->num_slices; i++) {
4684		if (sc->msix_ih[i] != NULL) {
4685			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4686					  sc->msix_ih[i]);
4687			sc->msix_ih[i] = NULL;
4688		}
4689	}
4690	free(sc->msix_ih, M_DEVBUF);
4691
4692
4693abort_with_res:
4694	for (i = 0; i < sc->num_slices; i++) {
4695		rid = i + 1;
4696		if (sc->msix_irq_res[i] != NULL)
4697			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4698					     sc->msix_irq_res[i]);
4699		sc->msix_irq_res[i] = NULL;
4700	}
4701	free(sc->msix_irq_res, M_DEVBUF);
4702
4703
4704abort_with_msix:
4705	pci_release_msi(sc->dev);
4706
4707abort_with_msix_table:
4708	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4709			     sc->msix_table_res);
4710
4711	return err;
4712}
4713
4714static int
4715mxge_add_single_irq(mxge_softc_t *sc)
4716{
4717	int count, err, rid;
4718
4719	count = pci_msi_count(sc->dev);
4720	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4721		rid = 1;
4722	} else {
4723		rid = 0;
4724		sc->legacy_irq = 1;
4725	}
4726	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4727					     RF_SHAREABLE | RF_ACTIVE);
4728	if (sc->irq_res == NULL) {
4729		device_printf(sc->dev, "could not alloc interrupt\n");
4730		return ENXIO;
4731	}
4732	if (mxge_verbose)
4733		device_printf(sc->dev, "using %s irq %jd\n",
4734			      sc->legacy_irq ? "INTx" : "MSI",
4735			      rman_get_start(sc->irq_res));
4736	err = bus_setup_intr(sc->dev, sc->irq_res,
4737			     INTR_TYPE_NET | INTR_MPSAFE,
4738#if __FreeBSD_version > 700030
4739			     NULL,
4740#endif
4741			     mxge_intr, &sc->ss[0], &sc->ih);
4742	if (err != 0) {
4743		bus_release_resource(sc->dev, SYS_RES_IRQ,
4744				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4745		if (!sc->legacy_irq)
4746			pci_release_msi(sc->dev);
4747	}
4748	return err;
4749}
4750
4751static void
4752mxge_rem_msix_irqs(mxge_softc_t *sc)
4753{
4754	int i, rid;
4755
4756	for (i = 0; i < sc->num_slices; i++) {
4757		if (sc->msix_ih[i] != NULL) {
4758			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4759					  sc->msix_ih[i]);
4760			sc->msix_ih[i] = NULL;
4761		}
4762	}
4763	free(sc->msix_ih, M_DEVBUF);
4764
4765	for (i = 0; i < sc->num_slices; i++) {
4766		rid = i + 1;
4767		if (sc->msix_irq_res[i] != NULL)
4768			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4769					     sc->msix_irq_res[i]);
4770		sc->msix_irq_res[i] = NULL;
4771	}
4772	free(sc->msix_irq_res, M_DEVBUF);
4773
4774	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4775			     sc->msix_table_res);
4776
4777	pci_release_msi(sc->dev);
4778	return;
4779}
4780
4781static void
4782mxge_rem_single_irq(mxge_softc_t *sc)
4783{
4784	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4785	bus_release_resource(sc->dev, SYS_RES_IRQ,
4786			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4787	if (!sc->legacy_irq)
4788		pci_release_msi(sc->dev);
4789}
4790
4791static void
4792mxge_rem_irq(mxge_softc_t *sc)
4793{
4794	if (sc->num_slices > 1)
4795		mxge_rem_msix_irqs(sc);
4796	else
4797		mxge_rem_single_irq(sc);
4798}
4799
4800static int
4801mxge_add_irq(mxge_softc_t *sc)
4802{
4803	int err;
4804
4805	if (sc->num_slices > 1)
4806		err = mxge_add_msix_irqs(sc);
4807	else
4808		err = mxge_add_single_irq(sc);
4809
4810	if (0 && err == 0 && sc->num_slices > 1) {
4811		mxge_rem_msix_irqs(sc);
4812		err = mxge_add_msix_irqs(sc);
4813	}
4814	return err;
4815}
4816
4817
4818static int
4819mxge_attach(device_t dev)
4820{
4821	mxge_cmd_t cmd;
4822	mxge_softc_t *sc = device_get_softc(dev);
4823	struct ifnet *ifp;
4824	int err, rid;
4825
4826	sc->dev = dev;
4827	mxge_fetch_tunables(sc);
4828
4829	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4830	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4831				  taskqueue_thread_enqueue, &sc->tq);
4832	if (sc->tq == NULL) {
4833		err = ENOMEM;
4834		goto abort_with_nothing;
4835	}
4836
4837	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4838				 1,			/* alignment */
4839				 0,			/* boundary */
4840				 BUS_SPACE_MAXADDR,	/* low */
4841				 BUS_SPACE_MAXADDR,	/* high */
4842				 NULL, NULL,		/* filter */
4843				 65536 + 256,		/* maxsize */
4844				 MXGE_MAX_SEND_DESC, 	/* num segs */
4845				 65536,			/* maxsegsize */
4846				 0,			/* flags */
4847				 NULL, NULL,		/* lock */
4848				 &sc->parent_dmat);	/* tag */
4849
4850	if (err != 0) {
4851		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4852			      err);
4853		goto abort_with_tq;
4854	}
4855
4856	ifp = sc->ifp = if_alloc(IFT_ETHER);
4857	if (ifp == NULL) {
4858		device_printf(dev, "can not if_alloc()\n");
4859		err = ENOSPC;
4860		goto abort_with_parent_dmat;
4861	}
4862	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4863
4864	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4865		 device_get_nameunit(dev));
4866	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4867	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4868		 "%s:drv", device_get_nameunit(dev));
4869	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4870		 MTX_NETWORK_LOCK, MTX_DEF);
4871
4872	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4873
4874	mxge_setup_cfg_space(sc);
4875
4876	/* Map the board into the kernel */
4877	rid = PCIR_BARS;
4878	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4879					     RF_ACTIVE);
4880	if (sc->mem_res == NULL) {
4881		device_printf(dev, "could not map memory\n");
4882		err = ENXIO;
4883		goto abort_with_lock;
4884	}
4885	sc->sram = rman_get_virtual(sc->mem_res);
4886	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4887	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4888		device_printf(dev, "impossible memory region size %jd\n",
4889			      rman_get_size(sc->mem_res));
4890		err = ENXIO;
4891		goto abort_with_mem_res;
4892	}
4893
4894	/* make NULL terminated copy of the EEPROM strings section of
4895	   lanai SRAM */
4896	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4897	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4898				rman_get_bushandle(sc->mem_res),
4899				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4900				sc->eeprom_strings,
4901				MXGE_EEPROM_STRINGS_SIZE - 2);
4902	err = mxge_parse_strings(sc);
4903	if (err != 0)
4904		goto abort_with_mem_res;
4905
4906	/* Enable write combining for efficient use of PCIe bus */
4907	mxge_enable_wc(sc);
4908
4909	/* Allocate the out of band dma memory */
4910	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4911			     sizeof (mxge_cmd_t), 64);
4912	if (err != 0)
4913		goto abort_with_mem_res;
4914	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4915	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4916	if (err != 0)
4917		goto abort_with_cmd_dma;
4918
4919	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4920	if (err != 0)
4921		goto abort_with_zeropad_dma;
4922
4923	/* select & load the firmware */
4924	err = mxge_select_firmware(sc);
4925	if (err != 0)
4926		goto abort_with_dmabench;
4927	sc->intr_coal_delay = mxge_intr_coal_delay;
4928
4929	mxge_slice_probe(sc);
4930	err = mxge_alloc_slices(sc);
4931	if (err != 0)
4932		goto abort_with_dmabench;
4933
4934	err = mxge_reset(sc, 0);
4935	if (err != 0)
4936		goto abort_with_slices;
4937
4938	err = mxge_alloc_rings(sc);
4939	if (err != 0) {
4940		device_printf(sc->dev, "failed to allocate rings\n");
4941		goto abort_with_slices;
4942	}
4943
4944	err = mxge_add_irq(sc);
4945	if (err != 0) {
4946		device_printf(sc->dev, "failed to add irq\n");
4947		goto abort_with_rings;
4948	}
4949
4950	ifp->if_baudrate = IF_Gbps(10);
4951	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4952		IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4953		IFCAP_RXCSUM_IPV6;
4954#if defined(INET) || defined(INET6)
4955	ifp->if_capabilities |= IFCAP_LRO;
4956#endif
4957
4958#ifdef MXGE_NEW_VLAN_API
4959	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4960
4961	/* Only FW 1.4.32 and newer can do TSO over vlans */
4962	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4963	    sc->fw_ver_tiny >= 32)
4964		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4965#endif
4966	sc->max_mtu = mxge_max_mtu(sc);
4967	if (sc->max_mtu >= 9000)
4968		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4969	else
4970		device_printf(dev, "MTU limited to %d.  Install "
4971			      "latest firmware for 9000 byte jumbo support\n",
4972			      sc->max_mtu - ETHER_HDR_LEN);
4973	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4974	ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4975	/* check to see if f/w supports TSO for IPv6 */
4976	if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4977		if (CSUM_TCP_IPV6)
4978			ifp->if_capabilities |= IFCAP_TSO6;
4979		sc->max_tso6_hlen = min(cmd.data0,
4980					sizeof (sc->ss[0].scratch));
4981	}
4982	ifp->if_capenable = ifp->if_capabilities;
4983	if (sc->lro_cnt == 0)
4984		ifp->if_capenable &= ~IFCAP_LRO;
4985	ifp->if_init = mxge_init;
4986	ifp->if_softc = sc;
4987	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4988	ifp->if_ioctl = mxge_ioctl;
4989	ifp->if_start = mxge_start;
4990	ifp->if_get_counter = mxge_get_counter;
4991	ifp->if_hw_tsomax = IP_MAXPACKET - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4992	ifp->if_hw_tsomaxsegcount = sc->ss[0].tx.max_desc;
4993	ifp->if_hw_tsomaxsegsize = IP_MAXPACKET;
4994	/* Initialise the ifmedia structure */
4995	ifmedia_init(&sc->media, 0, mxge_media_change,
4996		     mxge_media_status);
4997	mxge_media_init(sc);
4998	mxge_media_probe(sc);
4999	sc->dying = 0;
5000	ether_ifattach(ifp, sc->mac_addr);
5001	/* ether_ifattach sets mtu to ETHERMTU */
5002	if (mxge_initial_mtu != ETHERMTU)
5003		mxge_change_mtu(sc, mxge_initial_mtu);
5004
5005	mxge_add_sysctls(sc);
5006#ifdef IFNET_BUF_RING
5007	ifp->if_transmit = mxge_transmit;
5008	ifp->if_qflush = mxge_qflush;
5009#endif
5010	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
5011				device_get_nameunit(sc->dev));
5012	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
5013	return 0;
5014
5015abort_with_rings:
5016	mxge_free_rings(sc);
5017abort_with_slices:
5018	mxge_free_slices(sc);
5019abort_with_dmabench:
5020	mxge_dma_free(&sc->dmabench_dma);
5021abort_with_zeropad_dma:
5022	mxge_dma_free(&sc->zeropad_dma);
5023abort_with_cmd_dma:
5024	mxge_dma_free(&sc->cmd_dma);
5025abort_with_mem_res:
5026	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5027abort_with_lock:
5028	pci_disable_busmaster(dev);
5029	mtx_destroy(&sc->cmd_mtx);
5030	mtx_destroy(&sc->driver_mtx);
5031	if_free(ifp);
5032abort_with_parent_dmat:
5033	bus_dma_tag_destroy(sc->parent_dmat);
5034abort_with_tq:
5035	if (sc->tq != NULL) {
5036		taskqueue_drain(sc->tq, &sc->watchdog_task);
5037		taskqueue_free(sc->tq);
5038		sc->tq = NULL;
5039	}
5040abort_with_nothing:
5041	return err;
5042}
5043
5044static int
5045mxge_detach(device_t dev)
5046{
5047	mxge_softc_t *sc = device_get_softc(dev);
5048
5049	if (mxge_vlans_active(sc)) {
5050		device_printf(sc->dev,
5051			      "Detach vlans before removing module\n");
5052		return EBUSY;
5053	}
5054	mtx_lock(&sc->driver_mtx);
5055	sc->dying = 1;
5056	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
5057		mxge_close(sc, 0);
5058	mtx_unlock(&sc->driver_mtx);
5059	ether_ifdetach(sc->ifp);
5060	if (sc->tq != NULL) {
5061		taskqueue_drain(sc->tq, &sc->watchdog_task);
5062		taskqueue_free(sc->tq);
5063		sc->tq = NULL;
5064	}
5065	callout_drain(&sc->co_hdl);
5066	ifmedia_removeall(&sc->media);
5067	mxge_dummy_rdma(sc, 0);
5068	mxge_rem_sysctls(sc);
5069	mxge_rem_irq(sc);
5070	mxge_free_rings(sc);
5071	mxge_free_slices(sc);
5072	mxge_dma_free(&sc->dmabench_dma);
5073	mxge_dma_free(&sc->zeropad_dma);
5074	mxge_dma_free(&sc->cmd_dma);
5075	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5076	pci_disable_busmaster(dev);
5077	mtx_destroy(&sc->cmd_mtx);
5078	mtx_destroy(&sc->driver_mtx);
5079	if_free(sc->ifp);
5080	bus_dma_tag_destroy(sc->parent_dmat);
5081	return 0;
5082}
5083
5084static int
5085mxge_shutdown(device_t dev)
5086{
5087	return 0;
5088}
5089
5090/*
5091  This file uses Myri10GE driver indentation.
5092
5093  Local Variables:
5094  c-file-style:"linux"
5095  tab-width:8
5096  End:
5097*/
5098