if_mxge.c revision 197395
1/******************************************************************************
2
3Copyright (c) 2006-2009, Myricom Inc.
4All rights reserved.
5
6Redistribution and use in source and binary forms, with or without
7modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10    this list of conditions and the following disclaimer.
11
12 2. Neither the name of the Myricom Inc, nor the names of its
13    contributors may be used to endorse or promote products derived from
14    this software without specific prior written permission.
15
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26POSSIBILITY OF SUCH DAMAGE.
27
28***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 197395 2009-09-21 20:16:10Z gallatin $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/linker.h>
36#include <sys/firmware.h>
37#include <sys/endian.h>
38#include <sys/sockio.h>
39#include <sys/mbuf.h>
40#include <sys/malloc.h>
41#include <sys/kdb.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/module.h>
45#include <sys/socket.h>
46#include <sys/sysctl.h>
47#include <sys/sx.h>
48
49/* count xmits ourselves, rather than via drbr */
50#define NO_SLOW_STATS
51#include <net/if.h>
52#include <net/if_arp.h>
53#include <net/ethernet.h>
54#include <net/if_dl.h>
55#include <net/if_media.h>
56
57#include <net/bpf.h>
58
59#include <net/if_types.h>
60#include <net/if_vlan_var.h>
61#include <net/zlib.h>
62
63#include <netinet/in_systm.h>
64#include <netinet/in.h>
65#include <netinet/ip.h>
66#include <netinet/tcp.h>
67
68#include <machine/bus.h>
69#include <machine/in_cksum.h>
70#include <machine/resource.h>
71#include <sys/bus.h>
72#include <sys/rman.h>
73#include <sys/smp.h>
74
75#include <dev/pci/pcireg.h>
76#include <dev/pci/pcivar.h>
77#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
78
79#include <vm/vm.h>		/* for pmap_mapdev() */
80#include <vm/pmap.h>
81
82#if defined(__i386) || defined(__amd64)
83#include <machine/specialreg.h>
84#endif
85
86#include <dev/mxge/mxge_mcp.h>
87#include <dev/mxge/mcp_gen_header.h>
88/*#define MXGE_FAKE_IFP*/
89#include <dev/mxge/if_mxge_var.h>
90#ifdef IFNET_BUF_RING
91#include <sys/buf_ring.h>
92#endif
93
94#include "opt_inet.h"
95
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;	/* 0 = probe; 1 = aligned fw; other nonzero = unaligned fw */
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
/* firmware(9) image names; the "ethp" variants work around unaligned
   PCIe completions, the "eth" variants assume aligned completions */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

/* newbus entry points */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

/* forward declarations for helpers used before their definitions */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
150
151static int
152mxge_probe(device_t dev)
153{
154	int rev;
155
156
157	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
158	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
159	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
160		rev = pci_get_revid(dev);
161		switch (rev) {
162		case MXGE_PCI_REV_Z8E:
163			device_set_desc(dev, "Myri10G-PCIE-8A");
164			break;
165		case MXGE_PCI_REV_Z8ES:
166			device_set_desc(dev, "Myri10G-PCIE-8B");
167			break;
168		default:
169			device_set_desc(dev, "Myri10G-PCIE-8??");
170			device_printf(dev, "Unrecognized rev %d NIC\n",
171				      rev);
172			break;
173		}
174		return 0;
175	}
176	return ENXIO;
177}
178
179static void
180mxge_enable_wc(mxge_softc_t *sc)
181{
182#if defined(__i386) || defined(__amd64)
183	vm_offset_t len;
184	int err;
185
186	sc->wc = 1;
187	len = rman_get_size(sc->mem_res);
188	err = pmap_change_attr((vm_offset_t) sc->sram,
189			       len, PAT_WRITE_COMBINING);
190	if (err != 0) {
191		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
192			      err);
193		sc->wc = 0;
194	}
195#endif
196}
197
198
199/* callback to get our DMA address */
200static void
201mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
202			 int error)
203{
204	if (error == 0) {
205		*(bus_addr_t *) arg = segs->ds_addr;
206	}
207}
208
/*
 * Allocate a single-segment, bus-visible DMA region of "bytes" bytes
 * with the requested alignment, recording the tag, map, KVA and bus
 * address in "dma".  Returns 0 or a bus_dma errno; on failure all
 * partially-created resources are torn down (goto cleanup chain).
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	/* A large, page-aligned allocation may span 4KB boundaries in
	   one segment; anything else is confined to a single 4KB page. */
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; the callback stores the bus address */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
268
269
/*
 * Release a region created by mxge_dma_alloc(): unload the map, free
 * the memory, then destroy the tag — the reverse of creation order.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
277
278/*
279 * The eeprom strings on the lanaiX have the format
280 * SN=x\0
281 * MAC=x:x:x:x:x:x\0
282 * PC=text\0
283 */
284
/*
 * Parse the NUL-separated "NAME=value" strings read from the NIC
 * EEPROM into the softc (MAC address, product code, serial number).
 * Returns 0 if a MAC address was found, ENXIO otherwise.
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
/* advance ptr past the current NUL-terminated string (or stop at limit) */
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	/* an empty string (double NUL) terminates the list */
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* NOTE(review): ptr advances only 1 here, but the
			   loop below starts with ptr += 3, so the first
			   hex pair is read at offset 4 (just past "MAC=")
			   and each later one at +3 ("xx:").  Looks odd,
			   but the arithmetic works out. */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			/* strncpy() leaves the last byte untouched when the
			   value fills the field — assumes the softc was
			   zeroed at allocation; TODO confirm */
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
327
/*
 * Enable ECRC generation on an upstream Nvidia (CK804/MCP55) PCIe
 * bridge by writing its extended config space directly through a
 * temporary pmap_mapdev() mapping.  Bails out silently unless the
 * parent bridge is a recognized Nvidia device with a known config
 * space base address.
 *
 * NOTE(review): the #if tests __x86_64__ but not __amd64__;
 * FreeBSD/amd64 compilers appear to define both — confirm.
 */
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* grandparent of the NIC is the PCIe bridge of interest */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* 0x10de = Nvidia */
	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* compute the physical address of this function's config space */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	/* 0x178 is the extended config register holding the ECRC bit */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
/* non-x86 stub: the Nvidia chipsets in question only exist on x86 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
461
462
/*
 * Ask the firmware to run its DMA benchmarks against the dmabench
 * buffer, recording the read, write and read/write figures in the
 * softc.  test_type selects either the normal benchmark command or
 * the unaligned-completion detection test.  Returns 0 or an errno.
 */
static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	/* read test: len * 0x10000 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	/* write test: len * 0x1 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	/* read+write test: len * 0x10001 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	/* the unaligned test's failure is handled by the caller, so
	   suppress the complaint for it */
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
523
524/*
525 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
526 * when the PCI-E Completion packets are aligned on an 8-byte
527 * boundary.  Some PCI-E chip sets always align Completion packets; on
528 * the ones that do not, the alignment can be enforced by enabling
529 * ECRC generation (if supported).
530 *
531 * When PCI-E Completion packets are not aligned, it is actually more
532 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
533 *
534 * If the driver can neither enable ECRC nor verify that it has
535 * already been enabled, then it must use a firmware image which works
536 * around unaligned completion packets (ethp_z8e.dat), and it should
537 * also ensure that it never gives the device a Read-DMA which is
538 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
539 * enabled, then the driver should use the aligned (eth_z8e.dat)
540 * firmware image, and set tx_boundary to 4KB.
541 */
542
543static int
544mxge_firmware_probe(mxge_softc_t *sc)
545{
546	device_t dev = sc->dev;
547	int reg, status;
548	uint16_t pectl;
549
550	sc->tx_boundary = 4096;
551	/*
552	 * Verify the max read request size was set to 4KB
553	 * before trying the test with 4KB.
554	 */
555	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
556		pectl = pci_read_config(dev, reg + 0x8, 2);
557		if ((pectl & (5 << 12)) != (5 << 12)) {
558			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
559				      pectl);
560			sc->tx_boundary = 2048;
561		}
562	}
563
564	/*
565	 * load the optimized firmware (which assumes aligned PCIe
566	 * completions) in order to see if it works on this host.
567	 */
568	sc->fw_name = mxge_fw_aligned;
569	status = mxge_load_firmware(sc, 1);
570	if (status != 0) {
571		return status;
572	}
573
574	/*
575	 * Enable ECRC if possible
576	 */
577	mxge_enable_nvidia_ecrc(sc);
578
579	/*
580	 * Run a DMA test which watches for unaligned completions and
581	 * aborts on the first one seen.
582	 */
583
584	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
585	if (status == 0)
586		return 0; /* keep the aligned firmware */
587
588	if (status != E2BIG)
589		device_printf(dev, "DMA test failed: %d\n", status);
590	if (status == ENOSYS)
591		device_printf(dev, "Falling back to ethp! "
592			      "Please install up to date fw\n");
593	return status;
594}
595
596static int
597mxge_select_firmware(mxge_softc_t *sc)
598{
599	int aligned = 0;
600	int force_firmware = mxge_force_firmware;
601
602	if (sc->throttle)
603		force_firmware = sc->throttle;
604
605	if (force_firmware != 0) {
606		if (force_firmware == 1)
607			aligned = 1;
608		else
609			aligned = 0;
610		if (mxge_verbose)
611			device_printf(sc->dev,
612				      "Assuming %s completions (forced)\n",
613				      aligned ? "aligned" : "unaligned");
614		goto abort;
615	}
616
617	/* if the PCIe link width is 4 or less, we can use the aligned
618	   firmware and skip any checks */
619	if (sc->link_width != 0 && sc->link_width <= 4) {
620		device_printf(sc->dev,
621			      "PCIe x%d Link, expect reduced performance\n",
622			      sc->link_width);
623		aligned = 1;
624		goto abort;
625	}
626
627	if (0 == mxge_firmware_probe(sc))
628		return 0;
629
630abort:
631	if (aligned) {
632		sc->fw_name = mxge_fw_aligned;
633		sc->tx_boundary = 4096;
634	} else {
635		sc->fw_name = mxge_fw_unaligned;
636		sc->tx_boundary = 2048;
637	}
638	return (mxge_load_firmware(sc, 0));
639}
640
/* Union used to strip const from a char pointer without a cast
   warning: store via ro_char, read back via rw_char.  Presumably
   used elsewhere in this file for firmware data — not visible here. */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
646
647static int
648mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
649{
650
651
652	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
653		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
654			      be32toh(hdr->mcp_type));
655		return EIO;
656	}
657
658	/* save firmware version for sysctl */
659	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
660	if (mxge_verbose)
661		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
662
663	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
664	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
665
666	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
667	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
668		device_printf(sc->dev, "Found firmware version %s\n",
669			      sc->fw_version);
670		device_printf(sc->dev, "Driver needs %d.%d\n",
671			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
672		return EINVAL;
673	}
674	return 0;
675
676}
677
678static void *
679z_alloc(void *nil, u_int items, u_int size)
680{
681        void *ptr;
682
683        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
684        return ptr;
685}
686
687static void
688z_free(void *nil, void *ptr)
689{
690        free(ptr, M_TEMP);
691}
692
693
694static int
695mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
696{
697	z_stream zs;
698	char *inflate_buffer;
699	const struct firmware *fw;
700	const mcp_gen_header_t *hdr;
701	unsigned hdr_offset;
702	int status;
703	unsigned int i;
704	char dummy;
705	size_t fw_len;
706
707	fw = firmware_get(sc->fw_name);
708	if (fw == NULL) {
709		device_printf(sc->dev, "Could not find firmware image %s\n",
710			      sc->fw_name);
711		return ENOENT;
712	}
713
714
715
716	/* setup zlib and decompress f/w */
717	bzero(&zs, sizeof (zs));
718	zs.zalloc = z_alloc;
719	zs.zfree = z_free;
720	status = inflateInit(&zs);
721	if (status != Z_OK) {
722		status = EIO;
723		goto abort_with_fw;
724	}
725
726	/* the uncompressed size is stored as the firmware version,
727	   which would otherwise go unused */
728	fw_len = (size_t) fw->version;
729	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
730	if (inflate_buffer == NULL)
731		goto abort_with_zs;
732	zs.avail_in = fw->datasize;
733	zs.next_in = __DECONST(char *, fw->data);
734	zs.avail_out = fw_len;
735	zs.next_out = inflate_buffer;
736	status = inflate(&zs, Z_FINISH);
737	if (status != Z_STREAM_END) {
738		device_printf(sc->dev, "zlib %d\n", status);
739		status = EIO;
740		goto abort_with_buffer;
741	}
742
743	/* check id */
744	hdr_offset = htobe32(*(const uint32_t *)
745			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
746	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
747		device_printf(sc->dev, "Bad firmware file");
748		status = EIO;
749		goto abort_with_buffer;
750	}
751	hdr = (const void*)(inflate_buffer + hdr_offset);
752
753	status = mxge_validate_firmware(sc, hdr);
754	if (status != 0)
755		goto abort_with_buffer;
756
757	/* Copy the inflated firmware to NIC SRAM. */
758	for (i = 0; i < fw_len; i += 256) {
759		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
760			      inflate_buffer + i,
761			      min(256U, (unsigned)(fw_len - i)));
762		wmb();
763		dummy = *sc->sram;
764		wmb();
765	}
766
767	*limit = fw_len;
768	status = 0;
769abort_with_buffer:
770	free(inflate_buffer, M_TEMP);
771abort_with_zs:
772	inflateEnd(&zs);
773abort_with_fw:
774	firmware_put(fw, FIRMWARE_UNLOAD);
775	return status;
776}
777
778/*
779 * Enable or disable periodic RDMAs from the host to make certain
780 * chipsets resend dropped PCIe messages
781 */
782
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* carve an 8-byte-aligned command buffer out of buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll the confirmation word for up to ~20ms (20 x 1ms) */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
834
835static int
836mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
837{
838	mcp_cmd_t *buf;
839	char buf_bytes[sizeof(*buf) + 8];
840	volatile mcp_cmd_response_t *response = sc->cmd;
841	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
842	uint32_t dma_low, dma_high;
843	int err, sleep_total = 0;
844
845	/* ensure buf is aligned to 8 bytes */
846	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
847
848	buf->data0 = htobe32(data->data0);
849	buf->data1 = htobe32(data->data1);
850	buf->data2 = htobe32(data->data2);
851	buf->cmd = htobe32(cmd);
852	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
853	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
854
855	buf->response_addr.low = htobe32(dma_low);
856	buf->response_addr.high = htobe32(dma_high);
857	mtx_lock(&sc->cmd_mtx);
858	response->result = 0xffffffff;
859	wmb();
860	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
861
862	/* wait up to 20ms */
863	err = EAGAIN;
864	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
865		bus_dmamap_sync(sc->cmd_dma.dmat,
866				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
867		wmb();
868		switch (be32toh(response->result)) {
869		case 0:
870			data->data0 = be32toh(response->data);
871			err = 0;
872			break;
873		case 0xffffffff:
874			DELAY(1000);
875			break;
876		case MXGEFW_CMD_UNKNOWN:
877			err = ENOSYS;
878			break;
879		case MXGEFW_CMD_ERROR_UNALIGNED:
880			err = E2BIG;
881			break;
882		case MXGEFW_CMD_ERROR_BUSY:
883			err = EBUSY;
884			break;
885		default:
886			device_printf(sc->dev,
887				      "mxge: command %d "
888				      "failed, result = %d\n",
889				      cmd, be32toh(response->result));
890			err = ENXIO;
891			break;
892		}
893		if (err != EAGAIN)
894			break;
895	}
896	if (err == EAGAIN)
897		device_printf(sc->dev, "mxge: command %d timed out"
898			      "result = %d\n",
899			      cmd, be32toh(response->result));
900	mtx_unlock(&sc->cmd_mtx);
901	return err;
902}
903
/*
 * Fall back to the firmware already running on the NIC: copy its
 * header out of SRAM and validate type/version.  Also flags adopted
 * versions 1.4.4 - 1.4.11, which have a known rx filter bug that
 * requires keeping the NIC in ALLMULTI mode.  Returns 0 or an errno.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	/* NOTE(review): htobe32() here converts a big-endian value read
	   from SRAM; be32toh() would state the intent better, but the
	   two perform the identical byte swap. */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
952
953
/*
 * Load new firmware onto the NIC and hand execution off to it via the
 * bootstrap MCP.  If loading fails and "adopt" is set, fall back to
 * validating and keeping the firmware already running (with the
 * conservative 2KB tx_boundary).  Returns 0 or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* carve an 8-byte-aligned handoff buffer out of buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll the confirmation word for up to ~200ms (20 x 10ms) */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
1038
1039static int
1040mxge_update_mac_address(mxge_softc_t *sc)
1041{
1042	mxge_cmd_t cmd;
1043	uint8_t *addr = sc->mac_addr;
1044	int status;
1045
1046
1047	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1048		     | (addr[2] << 8) | addr[3]);
1049
1050	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1051
1052	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1053	return status;
1054}
1055
1056static int
1057mxge_change_pause(mxge_softc_t *sc, int pause)
1058{
1059	mxge_cmd_t cmd;
1060	int status;
1061
1062	if (pause)
1063		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1064				       &cmd);
1065	else
1066		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1067				       &cmd);
1068
1069	if (status) {
1070		device_printf(sc->dev, "Failed to set flow control mode\n");
1071		return ENXIO;
1072	}
1073	sc->pause = pause;
1074	return 0;
1075}
1076
1077static void
1078mxge_change_promisc(mxge_softc_t *sc, int promisc)
1079{
1080	mxge_cmd_t cmd;
1081	int status;
1082
1083	if (mxge_always_promisc)
1084		promisc = 1;
1085
1086	if (promisc)
1087		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1088				       &cmd);
1089	else
1090		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1091				       &cmd);
1092
1093	if (status) {
1094		device_printf(sc->dev, "Failed to set promisc mode\n");
1095	}
1096}
1097
/*
 * Push the interface's multicast address list down to the firmware:
 * temporarily enable ALLMULTI, flush the firmware's filters, join
 * each AF_LINK group address, then re-enable filtering.  Any error
 * while joining leaves the NIC in ALLMULTI (filtering off), which is
 * the safe fallback.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	/* adopted buggy firmware must stay in ALLMULTI mode */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte link-level address across data0/data1 */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
1165
1166static int
1167mxge_max_mtu(mxge_softc_t *sc)
1168{
1169	mxge_cmd_t cmd;
1170	int status;
1171
1172	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1173		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1174
1175	/* try to set nbufs to see if it we can
1176	   use virtually contiguous jumbos */
1177	cmd.data0 = 0;
1178	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1179			       &cmd);
1180	if (status == 0)
1181		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1182
1183	/* otherwise, we're limited to MJUMPAGESIZE */
1184	return MJUMPAGESIZE - MXGEFW_PAD;
1185}
1186
/*
 * Reset the NIC firmware and re-establish all driver/firmware shared
 * state: interrupt queue DMA addresses, interrupt coalescing/ack/
 * deassert SRAM pointers, per-slice counters, MAC address, promisc/
 * pause/multicast settings, and the optional transmit throttle.
 *
 * interrupts_setup: when non-zero, (re)program the per-slice interrupt
 * queue DMA addresses in the firmware.
 *
 * Returns 0 on success, ENXIO if the reset command itself fails, or a
 * firmware command error.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		/* NOTE(review): this assignment discards any error from
		   MXGEFW_CMD_SET_INTRQ_SIZE above -- confirm intended. */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			/* Zero the receive completion ring and tell the
			   firmware its bus address. */
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	/* Fetch SRAM offsets of the interrupt control registers; errors
	   are accumulated and checked once below. */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	/* The NIC reads the coalescing delay from SRAM in big-endian. */
	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		/* each slice's claim register is 2 words apart in SRAM */
		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	/* Re-apply driver settings lost across the firmware reset. */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}
1330
1331static int
1332mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1333{
1334	mxge_cmd_t cmd;
1335	mxge_softc_t *sc;
1336	int err;
1337	unsigned int throttle;
1338
1339	sc = arg1;
1340	throttle = sc->throttle;
1341	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1342        if (err != 0) {
1343                return err;
1344        }
1345
1346	if (throttle == sc->throttle)
1347		return 0;
1348
1349        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1350                return EINVAL;
1351
1352	mtx_lock(&sc->driver_mtx);
1353	cmd.data0 = throttle;
1354	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1355	if (err == 0)
1356		sc->throttle = throttle;
1357	mtx_unlock(&sc->driver_mtx);
1358	return err;
1359}
1360
1361static int
1362mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1363{
1364        mxge_softc_t *sc;
1365        unsigned int intr_coal_delay;
1366        int err;
1367
1368        sc = arg1;
1369        intr_coal_delay = sc->intr_coal_delay;
1370        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1371        if (err != 0) {
1372                return err;
1373        }
1374        if (intr_coal_delay == sc->intr_coal_delay)
1375                return 0;
1376
1377        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1378                return EINVAL;
1379
1380	mtx_lock(&sc->driver_mtx);
1381	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1382	sc->intr_coal_delay = intr_coal_delay;
1383
1384	mtx_unlock(&sc->driver_mtx);
1385        return err;
1386}
1387
1388static int
1389mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1390{
1391        mxge_softc_t *sc;
1392        unsigned int enabled;
1393        int err;
1394
1395        sc = arg1;
1396        enabled = sc->pause;
1397        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1398        if (err != 0) {
1399                return err;
1400        }
1401        if (enabled == sc->pause)
1402                return 0;
1403
1404	mtx_lock(&sc->driver_mtx);
1405	err = mxge_change_pause(sc, enabled);
1406	mtx_unlock(&sc->driver_mtx);
1407        return err;
1408}
1409
1410static int
1411mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1412{
1413	struct ifnet *ifp;
1414	int err = 0;
1415
1416	ifp = sc->ifp;
1417	if (lro_cnt == 0)
1418		ifp->if_capenable &= ~IFCAP_LRO;
1419	else
1420		ifp->if_capenable |= IFCAP_LRO;
1421	sc->lro_cnt = lro_cnt;
1422	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1423		mxge_close(sc, 0);
1424		err = mxge_open(sc);
1425	}
1426	return err;
1427}
1428
1429static int
1430mxge_change_lro(SYSCTL_HANDLER_ARGS)
1431{
1432	mxge_softc_t *sc;
1433	unsigned int lro_cnt;
1434	int err;
1435
1436	sc = arg1;
1437	lro_cnt = sc->lro_cnt;
1438	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1439	if (err != 0)
1440		return err;
1441
1442	if (lro_cnt == sc->lro_cnt)
1443		return 0;
1444
1445	if (lro_cnt > 128)
1446		return EINVAL;
1447
1448	mtx_lock(&sc->driver_mtx);
1449	err = mxge_change_lro_locked(sc, lro_cnt);
1450	mtx_unlock(&sc->driver_mtx);
1451	return err;
1452}
1453
1454static int
1455mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1456{
1457        int err;
1458
1459        if (arg1 == NULL)
1460                return EFAULT;
1461        arg2 = be32toh(*(int *)arg1);
1462        arg1 = NULL;
1463        err = sysctl_handle_int(oidp, arg1, arg2, req);
1464
1465        return err;
1466}
1467
1468static void
1469mxge_rem_sysctls(mxge_softc_t *sc)
1470{
1471	struct mxge_slice_state *ss;
1472	int slice;
1473
1474	if (sc->slice_sysctl_tree == NULL)
1475		return;
1476
1477	for (slice = 0; slice < sc->num_slices; slice++) {
1478		ss = &sc->ss[slice];
1479		if (ss == NULL || ss->sysctl_tree == NULL)
1480			continue;
1481		sysctl_ctx_free(&ss->sysctl_ctx);
1482		ss->sysctl_tree = NULL;
1483	}
1484	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1485	sc->slice_sysctl_tree = NULL;
1486}
1487
1488static void
1489mxge_add_sysctls(mxge_softc_t *sc)
1490{
1491	struct sysctl_ctx_list *ctx;
1492	struct sysctl_oid_list *children;
1493	mcp_irq_data_t *fw;
1494	struct mxge_slice_state *ss;
1495	int slice;
1496	char slice_num[8];
1497
1498	ctx = device_get_sysctl_ctx(sc->dev);
1499	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1500	fw = sc->ss[0].fw_stats;
1501
1502	/* random information */
1503	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1504		       "firmware_version",
1505		       CTLFLAG_RD, &sc->fw_version,
1506		       0, "firmware version");
1507	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1508		       "serial_number",
1509		       CTLFLAG_RD, &sc->serial_number_string,
1510		       0, "serial number");
1511	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1512		       "product_code",
1513		       CTLFLAG_RD, &sc->product_code_string,
1514		       0, "product_code");
1515	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1516		       "pcie_link_width",
1517		       CTLFLAG_RD, &sc->link_width,
1518		       0, "tx_boundary");
1519	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520		       "tx_boundary",
1521		       CTLFLAG_RD, &sc->tx_boundary,
1522		       0, "tx_boundary");
1523	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1524		       "write_combine",
1525		       CTLFLAG_RD, &sc->wc,
1526		       0, "write combining PIO?");
1527	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1528		       "read_dma_MBs",
1529		       CTLFLAG_RD, &sc->read_dma,
1530		       0, "DMA Read speed in MB/s");
1531	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532		       "write_dma_MBs",
1533		       CTLFLAG_RD, &sc->write_dma,
1534		       0, "DMA Write speed in MB/s");
1535	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1536		       "read_write_dma_MBs",
1537		       CTLFLAG_RD, &sc->read_write_dma,
1538		       0, "DMA concurrent Read/Write speed in MB/s");
1539	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1540		       "watchdog_resets",
1541		       CTLFLAG_RD, &sc->watchdog_resets,
1542		       0, "Number of times NIC was reset");
1543
1544
1545	/* performance related tunables */
1546	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1547			"intr_coal_delay",
1548			CTLTYPE_INT|CTLFLAG_RW, sc,
1549			0, mxge_change_intr_coal,
1550			"I", "interrupt coalescing delay in usecs");
1551
1552	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1553			"throttle",
1554			CTLTYPE_INT|CTLFLAG_RW, sc,
1555			0, mxge_change_throttle,
1556			"I", "transmit throttling");
1557
1558	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1559			"flow_control_enabled",
1560			CTLTYPE_INT|CTLFLAG_RW, sc,
1561			0, mxge_change_flow_control,
1562			"I", "interrupt coalescing delay in usecs");
1563
1564	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1565		       "deassert_wait",
1566		       CTLFLAG_RW, &mxge_deassert_wait,
1567		       0, "Wait for IRQ line to go low in ihandler");
1568
1569	/* stats block from firmware is in network byte order.
1570	   Need to swap it */
1571	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572			"link_up",
1573			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1574			0, mxge_handle_be32,
1575			"I", "link up");
1576	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1577			"rdma_tags_available",
1578			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1579			0, mxge_handle_be32,
1580			"I", "rdma_tags_available");
1581	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1582			"dropped_bad_crc32",
1583			CTLTYPE_INT|CTLFLAG_RD,
1584			&fw->dropped_bad_crc32,
1585			0, mxge_handle_be32,
1586			"I", "dropped_bad_crc32");
1587	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1588			"dropped_bad_phy",
1589			CTLTYPE_INT|CTLFLAG_RD,
1590			&fw->dropped_bad_phy,
1591			0, mxge_handle_be32,
1592			"I", "dropped_bad_phy");
1593	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594			"dropped_link_error_or_filtered",
1595			CTLTYPE_INT|CTLFLAG_RD,
1596			&fw->dropped_link_error_or_filtered,
1597			0, mxge_handle_be32,
1598			"I", "dropped_link_error_or_filtered");
1599	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1600			"dropped_link_overflow",
1601			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1602			0, mxge_handle_be32,
1603			"I", "dropped_link_overflow");
1604	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605			"dropped_multicast_filtered",
1606			CTLTYPE_INT|CTLFLAG_RD,
1607			&fw->dropped_multicast_filtered,
1608			0, mxge_handle_be32,
1609			"I", "dropped_multicast_filtered");
1610	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1611			"dropped_no_big_buffer",
1612			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1613			0, mxge_handle_be32,
1614			"I", "dropped_no_big_buffer");
1615	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1616			"dropped_no_small_buffer",
1617			CTLTYPE_INT|CTLFLAG_RD,
1618			&fw->dropped_no_small_buffer,
1619			0, mxge_handle_be32,
1620			"I", "dropped_no_small_buffer");
1621	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1622			"dropped_overrun",
1623			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1624			0, mxge_handle_be32,
1625			"I", "dropped_overrun");
1626	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1627			"dropped_pause",
1628			CTLTYPE_INT|CTLFLAG_RD,
1629			&fw->dropped_pause,
1630			0, mxge_handle_be32,
1631			"I", "dropped_pause");
1632	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1633			"dropped_runt",
1634			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1635			0, mxge_handle_be32,
1636			"I", "dropped_runt");
1637
1638	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1639			"dropped_unicast_filtered",
1640			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1641			0, mxge_handle_be32,
1642			"I", "dropped_unicast_filtered");
1643
1644	/* verbose printing? */
1645	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1646		       "verbose",
1647		       CTLFLAG_RW, &mxge_verbose,
1648		       0, "verbose printing");
1649
1650	/* lro */
1651	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1652			"lro_cnt",
1653			CTLTYPE_INT|CTLFLAG_RW, sc,
1654			0, mxge_change_lro,
1655			"I", "number of lro merge queues");
1656
1657
1658	/* add counters exported for debugging from all slices */
1659	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1660	sc->slice_sysctl_tree =
1661		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1662				"slice", CTLFLAG_RD, 0, "");
1663
1664	for (slice = 0; slice < sc->num_slices; slice++) {
1665		ss = &sc->ss[slice];
1666		sysctl_ctx_init(&ss->sysctl_ctx);
1667		ctx = &ss->sysctl_ctx;
1668		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1669		sprintf(slice_num, "%d", slice);
1670		ss->sysctl_tree =
1671			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1672					CTLFLAG_RD, 0, "");
1673		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1674		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675			       "rx_small_cnt",
1676			       CTLFLAG_RD, &ss->rx_small.cnt,
1677			       0, "rx_small_cnt");
1678		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679			       "rx_big_cnt",
1680			       CTLFLAG_RD, &ss->rx_big.cnt,
1681			       0, "rx_small_cnt");
1682		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1684			       0, "number of lro merge queues flushed");
1685
1686		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1688			       0, "number of frames appended to lro merge"
1689			       "queues");
1690
1691#ifndef IFNET_BUF_RING
1692		/* only transmit from slice 0 for now */
1693		if (slice > 0)
1694			continue;
1695#endif
1696		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1697			       "tx_req",
1698			       CTLFLAG_RD, &ss->tx.req,
1699			       0, "tx_req");
1700
1701		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1702			       "tx_done",
1703			       CTLFLAG_RD, &ss->tx.done,
1704			       0, "tx_done");
1705		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1706			       "tx_pkt_done",
1707			       CTLFLAG_RD, &ss->tx.pkt_done,
1708			       0, "tx_done");
1709		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1710			       "tx_stall",
1711			       CTLFLAG_RD, &ss->tx.stall,
1712			       0, "tx_stall");
1713		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1714			       "tx_wake",
1715			       CTLFLAG_RD, &ss->tx.wake,
1716			       0, "tx_wake");
1717		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1718			       "tx_defrag",
1719			       CTLFLAG_RD, &ss->tx.defrag,
1720			       0, "tx_defrag");
1721		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1722			       "tx_queue_active",
1723			       CTLFLAG_RD, &ss->tx.queue_active,
1724			       0, "tx_queue_active");
1725		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1726			       "tx_activate",
1727			       CTLFLAG_RD, &ss->tx.activate,
1728			       0, "tx_activate");
1729		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1730			       "tx_deactivate",
1731			       CTLFLAG_RD, &ss->tx.deactivate,
1732			       0, "tx_deactivate");
1733	}
1734}
1735
/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
	/* Descriptors are written last-to-first so the first slot (the
	   one carrying the valid flags) is only written by the caller
	   after the rest of the chain is visible to the NIC.  The loop
	   deliberately stops at cnt == 1: slot 0 is submitted by
	   mxge_submit_req(). */
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                wmb();
        }
}
1753
1754/*
1755 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1756 * at most 32 bytes at a time, so as to avoid involving the software
1757 * pio handler in the nic.   We re-write the first segment's flags
1758 * to mark them valid only after writing the entire chain
1759 */
1760
1761static inline void
1762mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1763                  int cnt)
1764{
1765        int idx, i;
1766        uint32_t *src_ints;
1767	volatile uint32_t *dst_ints;
1768        mcp_kreq_ether_send_t *srcp;
1769	volatile mcp_kreq_ether_send_t *dstp, *dst;
1770	uint8_t last_flags;
1771
1772        idx = tx->req & tx->mask;
1773
1774	last_flags = src->flags;
1775	src->flags = 0;
1776        wmb();
1777        dst = dstp = &tx->lanai[idx];
1778        srcp = src;
1779
1780        if ((idx + cnt) < tx->mask) {
1781                for (i = 0; i < (cnt - 1); i += 2) {
1782                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1783                        wmb(); /* force write every 32 bytes */
1784                        srcp += 2;
1785                        dstp += 2;
1786                }
1787        } else {
1788                /* submit all but the first request, and ensure
1789                   that it is submitted below */
1790                mxge_submit_req_backwards(tx, src, cnt);
1791                i = 0;
1792        }
1793        if (i < cnt) {
1794                /* submit the first request */
1795                mxge_pio_copy(dstp, srcp, sizeof(*src));
1796                wmb(); /* barrier before setting valid flag */
1797        }
1798
1799        /* re-write the last 32-bits with the valid flags */
1800        src->flags = last_flags;
1801        src_ints = (uint32_t *)src;
1802        src_ints+=3;
1803        dst_ints = (volatile uint32_t *)dst;
1804        dst_ints+=3;
1805        *dst_ints =  *src_ints;
1806        tx->req += cnt;
1807        wmb();
1808}
1809
1810#if IFCAP_TSO4
1811
1812static void
1813mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1814	       int busdma_seg_cnt, int ip_off)
1815{
1816	mxge_tx_ring_t *tx;
1817	mcp_kreq_ether_send_t *req;
1818	bus_dma_segment_t *seg;
1819	struct ip *ip;
1820	struct tcphdr *tcp;
1821	uint32_t low, high_swapped;
1822	int len, seglen, cum_len, cum_len_next;
1823	int next_is_first, chop, cnt, rdma_count, small;
1824	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1825	uint8_t flags, flags_next;
1826	static int once;
1827
1828	mss = m->m_pkthdr.tso_segsz;
1829
1830	/* negative cum_len signifies to the
1831	 * send loop that we are still in the
1832	 * header portion of the TSO packet.
1833	 */
1834
1835	/* ensure we have the ethernet, IP and TCP
1836	   header together in the first mbuf, copy
1837	   it to a scratch buffer if not */
1838	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1839		m_copydata(m, 0, ip_off + sizeof (*ip),
1840			   ss->scratch);
1841		ip = (struct ip *)(ss->scratch + ip_off);
1842	} else {
1843		ip = (struct ip *)(mtod(m, char *) + ip_off);
1844	}
1845	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1846			    + sizeof (*tcp))) {
1847		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1848			   + sizeof (*tcp),  ss->scratch);
1849		ip = (struct ip *)(mtod(m, char *) + ip_off);
1850	}
1851
1852	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1853	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1854
1855	/* TSO implies checksum offload on this hardware */
1856	cksum_offset = ip_off + (ip->ip_hl << 2);
1857	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1858
1859
1860	/* for TSO, pseudo_hdr_offset holds mss.
1861	 * The firmware figures out where to put
1862	 * the checksum by parsing the header. */
1863	pseudo_hdr_offset = htobe16(mss);
1864
1865	tx = &ss->tx;
1866	req = tx->req_list;
1867	seg = tx->seg_list;
1868	cnt = 0;
1869	rdma_count = 0;
1870	/* "rdma_count" is the number of RDMAs belonging to the
1871	 * current packet BEFORE the current send request. For
1872	 * non-TSO packets, this is equal to "count".
1873	 * For TSO packets, rdma_count needs to be reset
1874	 * to 0 after a segment cut.
1875	 *
1876	 * The rdma_count field of the send request is
1877	 * the number of RDMAs of the packet starting at
1878	 * that request. For TSO send requests with one ore more cuts
1879	 * in the middle, this is the number of RDMAs starting
1880	 * after the last cut in the request. All previous
1881	 * segments before the last cut implicitly have 1 RDMA.
1882	 *
1883	 * Since the number of RDMAs is not known beforehand,
1884	 * it must be filled-in retroactively - after each
1885	 * segmentation cut or at the end of the entire packet.
1886	 */
1887
1888	while (busdma_seg_cnt) {
1889		/* Break the busdma segment up into pieces*/
1890		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1891		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1892		len = seg->ds_len;
1893
1894		while (len) {
1895			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1896			seglen = len;
1897			cum_len_next = cum_len + seglen;
1898			(req-rdma_count)->rdma_count = rdma_count + 1;
1899			if (__predict_true(cum_len >= 0)) {
1900				/* payload */
1901				chop = (cum_len_next > mss);
1902				cum_len_next = cum_len_next % mss;
1903				next_is_first = (cum_len_next == 0);
1904				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1905				flags_next |= next_is_first *
1906					MXGEFW_FLAGS_FIRST;
1907				rdma_count |= -(chop | next_is_first);
1908				rdma_count += chop & !next_is_first;
1909			} else if (cum_len_next >= 0) {
1910				/* header ends */
1911				rdma_count = -1;
1912				cum_len_next = 0;
1913				seglen = -cum_len;
1914				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1915				flags_next = MXGEFW_FLAGS_TSO_PLD |
1916					MXGEFW_FLAGS_FIRST |
1917					(small * MXGEFW_FLAGS_SMALL);
1918			    }
1919
1920			req->addr_high = high_swapped;
1921			req->addr_low = htobe32(low);
1922			req->pseudo_hdr_offset = pseudo_hdr_offset;
1923			req->pad = 0;
1924			req->rdma_count = 1;
1925			req->length = htobe16(seglen);
1926			req->cksum_offset = cksum_offset;
1927			req->flags = flags | ((cum_len & 1) *
1928					      MXGEFW_FLAGS_ALIGN_ODD);
1929			low += seglen;
1930			len -= seglen;
1931			cum_len = cum_len_next;
1932			flags = flags_next;
1933			req++;
1934			cnt++;
1935			rdma_count++;
1936			if (__predict_false(cksum_offset > seglen))
1937				cksum_offset -= seglen;
1938			else
1939				cksum_offset = 0;
1940			if (__predict_false(cnt > tx->max_desc))
1941				goto drop;
1942		}
1943		busdma_seg_cnt--;
1944		seg++;
1945	}
1946	(req-rdma_count)->rdma_count = rdma_count;
1947
1948	do {
1949		req--;
1950		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1951	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1952
1953	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1954	mxge_submit_req(tx, tx->req_list, cnt);
1955#ifdef IFNET_BUF_RING
1956	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1957		/* tell the NIC to start polling this slice */
1958		*tx->send_go = 1;
1959		tx->queue_active = 1;
1960		tx->activate++;
1961		wmb();
1962	}
1963#endif
1964	return;
1965
1966drop:
1967	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1968	m_freem(m);
1969	ss->oerrors++;
1970	if (!once) {
1971		printf("tx->max_desc exceeded via TSO!\n");
1972		printf("mss = %d, %ld, %d!\n", mss,
1973		       (long)seg - (long)tx->seg_list, tx->max_desc);
1974		once = 1;
1975	}
1976	return;
1977
1978}
1979
1980#endif /* IFCAP_TSO4 */
1981
1982#ifdef MXGE_NEW_VLAN_API
1983/*
1984 * We reproduce the software vlan tag insertion from
1985 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1986 * vlan tag insertion. We need to advertise this in order to have the
1987 * vlan interface respect our csum offload flags.
1988 */
static struct mbuf *
mxge_vlan_tag_insert(struct mbuf *m)
{
	struct ether_vlan_header *evl;

	/* Make room for the 4-byte 802.1Q header; M_PREPEND may
	   reallocate and sets m to NULL on allocation failure. */
	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
	if (__predict_false(m == NULL))
		return NULL;
	/* Make sure the full vlan header is contiguous. */
	if (m->m_len < sizeof(*evl)) {
		m = m_pullup(m, sizeof(*evl));
		if (__predict_false(m == NULL))
			return NULL;
	}
	/*
	 * Transform the Ethernet header into an Ethernet header
	 * with 802.1Q encapsulation.
	 */
	evl = mtod(m, struct ether_vlan_header *);
	/* slide dst/src MAC addresses down over the prepended space */
	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
	/* the tag is now in the frame itself, not mbuf metadata */
	m->m_flags &= ~M_VLANTAG;
	return m;
}
2014#endif /* MXGE_NEW_VLAN_API */
2015
/*
 * Map a single outgoing frame for DMA and build its firmware send
 * request list on the given slice's transmit ring.  Handles software
 * vlan tag insertion, EFBIG recovery via m_defrag(), checksum offload
 * setup, hand-off to mxge_encap_tso() for TSO frames, and padding of
 * runt frames to the 60-byte minimum using the shared zero pad buffer.
 * On failure the mbuf is freed and ss->oerrors is incremented.
 */
static void
mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
{
	mxge_softc_t *sc;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;


	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	ip_off = sizeof (struct ether_header);
#ifdef MXGE_NEW_VLAN_API
	/* software vlan tag insertion (see mxge_vlan_tag_insert) */
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop;
		ip_off += ETHER_VLAN_ENCAP_LEN;
	}
#endif
	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		ss->tx.defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;

#if IFCAP_TSO4
	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(ss, m, cnt, ip_off);
		return;
	}
#endif

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
			m_copydata(m, 0, ip_off + sizeof (*ip),
				   ss->scratch);
			ip = (struct ip *)(ss->scratch + ip_off);
		} else {
			ip = (struct ip *)(mtod(m, char *) + ip_off);
		}
		cksum_offset = ip_off + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* cksum_offset tracks how far into the remaining data
		   the checksummed region starts; 0 once passed */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		/* point the extra descriptor at the shared zero buffer */
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	m_freem(m);
	ss->oerrors++;
	return;
}
2182
2183#ifdef IFNET_BUF_RING
2184static void
2185mxge_qflush(struct ifnet *ifp)
2186{
2187	mxge_softc_t *sc = ifp->if_softc;
2188	mxge_tx_ring_t *tx;
2189	struct mbuf *m;
2190	int slice;
2191
2192	for (slice = 0; slice < sc->num_slices; slice++) {
2193		tx = &sc->ss[slice].tx;
2194		mtx_lock(&tx->mtx);
2195		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2196			m_freem(m);
2197		mtx_unlock(&tx->mtx);
2198	}
2199	if_qflush(ifp);
2200}
2201
/*
 * Drain this slice's buf_ring onto the NIC while transmit
 * descriptors are available.  Caller must hold the slice tx lock.
 */
static inline void
mxge_start_locked(struct mxge_slice_state *ss)
{
	mxge_softc_t *sc;
	struct mbuf *m;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;

	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	/* (req - done) is the number of descriptors in flight; keep
	   sending while at least max_desc slots remain free */
	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
		m = drbr_dequeue(ifp, tx->br);
		if (m == NULL) {
			return;
		}
		/* let BPF see it */
		BPF_MTAP(ifp, m);

		/* give it to the nic */
		mxge_encap(ss, m);
	}
	/* ran out of transmit slots; mark this slice stalled so
	   mxge_tx_done() restarts it when descriptors free up */
	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
	    && (!drbr_empty(ifp, tx->br))) {
		ss->if_drv_flags |= IFF_DRV_OACTIVE;
		tx->stall++;
	}
}
2232
/*
 * Transmit one mbuf on this slice with the tx lock held.  Sends
 * inline when the ring is idle and has room; otherwise the packet
 * is enqueued on the buf_ring (preserving ordering) and the queue
 * is drained via mxge_start_locked().  Returns 0 or an errno from
 * drbr_enqueue().
 */
static int
mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	int err;

	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	/* if not running, or flow-controlled, just queue the packet */
	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING) {
		err = drbr_enqueue(ifp, tx->br, m);
		return (err);
	}

	/* fast path: ring empty and descriptors free -> send directly,
	   bypassing the buf_ring */
	if (drbr_empty(ifp, tx->br) &&
	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
		/* let BPF see it */
		BPF_MTAP(ifp, m);
		/* give it to the nic */
		mxge_encap(ss, m);
	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
		return (err);
	}
	/* drain anything queued (including the packet just enqueued) */
	if (!drbr_empty(ifp, tx->br))
		mxge_start_locked(ss);
	return (0);
}
2264
2265static int
2266mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2267{
2268	mxge_softc_t *sc = ifp->if_softc;
2269	struct mxge_slice_state *ss;
2270	mxge_tx_ring_t *tx;
2271	int err = 0;
2272	int slice;
2273
2274	slice = m->m_pkthdr.flowid;
2275	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2276
2277	ss = &sc->ss[slice];
2278	tx = &ss->tx;
2279
2280	if (mtx_trylock(&tx->mtx)) {
2281		err = mxge_transmit_locked(ss, m);
2282		mtx_unlock(&tx->mtx);
2283	} else {
2284		err = drbr_enqueue(ifp, tx->br, m);
2285	}
2286
2287	return (err);
2288}
2289
2290#else
2291
/*
 * Non-buf_ring variant: drain the ifnet send queue onto the NIC
 * while transmit descriptors are available.  Caller must hold the
 * slice tx lock.
 */
static inline void
mxge_start_locked(struct mxge_slice_state *ss)
{
	mxge_softc_t *sc;
	struct mbuf *m;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;

	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;
	/* (req - done) is the number of descriptors in flight; keep
	   sending while at least max_desc slots remain free */
	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL) {
			return;
		}
		/* let BPF see it */
		BPF_MTAP(ifp, m);

		/* give it to the nic */
		mxge_encap(ss, m);
	}
	/* ran out of transmit slots; flow-control the stack until
	   mxge_tx_done() clears OACTIVE */
	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
		tx->stall++;
	}
}
2320#endif
2321static void
2322mxge_start(struct ifnet *ifp)
2323{
2324	mxge_softc_t *sc = ifp->if_softc;
2325	struct mxge_slice_state *ss;
2326
2327	/* only use the first slice for now */
2328	ss = &sc->ss[0];
2329	mtx_lock(&ss->tx.mtx);
2330	mxge_start_locked(ss);
2331	mtx_unlock(&ss->tx.mtx);
2332}
2333
2334/*
2335 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2336 * at most 32 bytes at a time, so as to avoid involving the software
2337 * pio handler in the nic.   We re-write the first segment's low
2338 * DMA address to mark it valid only after we write the entire chunk
2339 * in a burst
2340 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* stash the real low address and poison it so the NIC does
	   not consider the chunk valid until the final store below */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the shadow copy, then write the real low address to
	   the NIC last, marking the whole 8-descriptor chunk valid */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2357
/*
 * Allocate and DMA-map a replacement mbuf for small-receive ring
 * slot idx, recording its bus address in the shadow ring.  Returns
 * 0 or an errno; on failure the slot keeps its previous shadow
 * contents.  Every 8th slot, the last 8 shadow entries are pushed
 * to the NIC -- this happens even on allocation failure, which
 * re-posts the old buffer addresses.
 */
static int
mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_small;
	int cnt, err;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* submit descriptors to the NIC in bursts of 8 */
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
	return err;
}
2390
/*
 * Allocate and DMA-map a replacement cluster mbuf for big-receive
 * ring slot idx.  With MXGE_VIRT_JUMBOS a jumbo buffer may map to
 * up to 3 segments, filling rx->nbufs consecutive shadow slots.
 * Returns 0 or an errno; as in mxge_get_buf_small(), descriptors
 * are pushed to the NIC in bursts of 8 even on failure (re-posting
 * the old buffer addresses).
 */
static int
mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg[3];
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_big;
	int cnt, err, i;

	if (rx->cl_size == MCLBYTES)
		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	else
		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = rx->mlen;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));

#if MXGE_VIRT_JUMBOS
	/* record the remaining segments in the following slots */
	for (i = 1; i < cnt; i++) {
		rx->shadow[idx + i].addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
		rx->shadow[idx + i].addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
       }
#endif

done:
       for (i = 0; i < rx->nbufs; i++) {
		if ((idx & 7) == 7) {
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		}
		idx++;
	}
	return err;
}
2440
2441/*
2442 *  Myri10GE hardware checksums are not valid if the sender
2443 *  padded the frame with non-zero padding.  This is because
2444 *  the firmware just does a simple 16-bit 1s complement
2445 *  checksum across the entire frame, excluding the first 14
2446 *  bytes.  It is best to simply to check the checksum and
2447 *  tell the stack about it only if the checksum is good
2448 */
2449
/*
 * Validate the firmware's raw 16-bit 1s-complement sum of the
 * frame against the IPv4 TCP/UDP pseudo-header.  Returns 0 when
 * the checksum validates, non-zero otherwise.  Non-IPv4 or
 * non-TCP/UDP frames always return 1 (not validated).
 */
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
	struct ip *ip;
	uint16_t c;

	eh = mtod(m, struct ether_header *);

	/* only deal with IPv4 TCP & UDP for now */
	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
		return 1;
	ip = (struct ip *)(eh + 1);
	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
			    ip->ip_p != IPPROTO_UDP))
		return 1;
#ifdef INET
	/* fold the hardware sum (which covered the IP header too)
	   with the pseudo-header: subtract the IP header length and
	   add the protocol, per the pseudo-header definition */
	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
			    - (ip->ip_hl << 2) + ip->ip_p));
#else
	/* no INET support compiled in: report "not validated" */
	c = 1;
#endif
	c ^= 0xffff;
	return (c);
}
2476
/*
 * Strip the 802.1q header from a received frame: move the VLAN
 * tag into the mbuf packet header, adjust the firmware's partial
 * checksum to exclude the 4 encapsulation bytes, and splice the
 * Ethernet addresses over the removed header.
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	/* the 4 VLAN bytes the firmware summed but which we remove */
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* 1s-complement subtraction: add the complement, propagate
	   the end-around carry, then fold twice back to 16 bits */
	(*csum) += ~partial;
	(*csum) +=  ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
#ifdef MXGE_NEW_VLAN_API
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
#else
	{
		/* legacy API: carry the tag as an mbuf tag */
		struct m_tag *mtag;
		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
				   M_NOWAIT);
		if (mtag == NULL)
			return;
		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
		m_tag_prepend(m, mtag);
	}

#endif
	m->m_flags |= M_VLANTAG;

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf.  The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2532
2533
/*
 * Deliver one received frame from the big-receive ring to the
 * stack: replace the buffer, strip any VLAN tag, check the
 * hardware checksum, attempt LRO, and pass the mbuf up.
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_big;
	idx = rx->cnt & rx->mask;
	/* a jumbo frame may consume nbufs ring slots */
	rx->cnt += rx->nbufs;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		/* LRO consumes the mbuf on success */
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2598
/*
 * Small-ring twin of mxge_rx_done_big(): deliver one received
 * frame that fit in an MHLEN buffer.  The ring advances one slot
 * per frame here (vs. nbufs slots for the big ring).
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct ether_header *eh;
	struct mbuf *m;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		/* LRO consumes the mbuf on success */
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2663
/*
 * Process completed receive events for a slice: walk the rx_done
 * ring until an empty (length == 0) entry is found, dispatching
 * each frame to the small or big handler by size, then flush any
 * active LRO sessions.
 */
static inline void
mxge_clean_rx_done(struct mxge_slice_state *ss)
{
	mxge_rx_done_t *rx_done = &ss->rx_done;
	int limit = 0;
	uint16_t length;
	uint16_t checksum;


	while (rx_done->entry[rx_done->idx].length != 0) {
		length = ntohs(rx_done->entry[rx_done->idx].length);
		/* clear the entry so the firmware can reuse it */
		rx_done->entry[rx_done->idx].length = 0;
		checksum = rx_done->entry[rx_done->idx].checksum;
		if (length <= (MHLEN - MXGEFW_PAD))
			mxge_rx_done_small(ss, length, checksum);
		else
			mxge_rx_done_big(ss, length, checksum);
		rx_done->cnt++;
		rx_done->idx = rx_done->cnt & rx_done->mask;

		/* limit potential for livelock */
		if (__predict_false(++limit > rx_done->mask / 2))
			break;
	}
#ifdef INET
	/* push any packets held by LRO up the stack */
	while (!SLIST_EMPTY(&ss->lro_active)) {
		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
		SLIST_REMOVE_HEAD(&ss->lro_active, next);
		mxge_lro_flush(ss, lro);
	}
#endif
}
2696
2697
2698static inline void
2699mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2700{
2701	struct ifnet *ifp;
2702	mxge_tx_ring_t *tx;
2703	struct mbuf *m;
2704	bus_dmamap_t map;
2705	int idx;
2706	int *flags;
2707
2708	tx = &ss->tx;
2709	ifp = ss->sc->ifp;
2710	while (tx->pkt_done != mcp_idx) {
2711		idx = tx->done & tx->mask;
2712		tx->done++;
2713		m = tx->info[idx].m;
2714		/* mbuf and DMA map only attached to the first
2715		   segment per-mbuf */
2716		if (m != NULL) {
2717			ss->obytes += m->m_pkthdr.len;
2718			if (m->m_flags & M_MCAST)
2719				ss->omcasts++;
2720			ss->opackets++;
2721			tx->info[idx].m = NULL;
2722			map = tx->info[idx].map;
2723			bus_dmamap_unload(tx->dmat, map);
2724			m_freem(m);
2725		}
2726		if (tx->info[idx].flag) {
2727			tx->info[idx].flag = 0;
2728			tx->pkt_done++;
2729		}
2730	}
2731
2732	/* If we have space, clear IFF_OACTIVE to tell the stack that
2733           its OK to send packets */
2734#ifdef IFNET_BUF_RING
2735	flags = &ss->if_drv_flags;
2736#else
2737	flags = &ifp->if_drv_flags;
2738#endif
2739	mtx_lock(&ss->tx.mtx);
2740	if ((*flags) & IFF_DRV_OACTIVE &&
2741	    tx->req - tx->done < (tx->mask + 1)/4) {
2742		*(flags) &= ~IFF_DRV_OACTIVE;
2743		ss->tx.wake++;
2744		mxge_start_locked(ss);
2745	}
2746#ifdef IFNET_BUF_RING
2747	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2748		/* let the NIC stop polling this queue, since there
2749		 * are no more transmits pending */
2750		if (tx->req == tx->done) {
2751			*tx->send_stop = 1;
2752			tx->queue_active = 0;
2753			tx->deactivate++;
2754			wmb();
2755		}
2756	}
2757#endif
2758	mtx_unlock(&ss->tx.mtx);
2759
2760}
2761
/* XFP module 10GbE compliance bits mapped to ifmedia types (flag
   0 entries have no matching FreeBSD media type); consumed by
   mxge_media_probe() */
static struct mxge_media_type mxge_xfp_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
/* SFP+ module compliance bits mapped to ifmedia types; consumed
   by mxge_media_probe() */
static struct mxge_media_type mxge_sfp_media_types[] =
{
	{0,		(1 << 7),	"Reserved"},
	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"}
};
2781
/*
 * Record a newly-identified media type: OR it into the cached
 * media flags, then register and select it with ifmedia.
 */
static void
mxge_set_media(mxge_softc_t *sc, int type)
{
	sc->media_flags |= type;
	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
	ifmedia_set(&sc->media, sc->media_flags);
}
2789
2790
2791/*
2792 * Determine the media type for a NIC.  Some XFPs will identify
2793 * themselves only when their link is up, so this is initiated via a
2794 * link up interrupt.  However, this can potentially take up to
2795 * several milliseconds, so it is run via the watchdog routine, rather
2796 * than in the interrupt handler itself.   This need only be done
2797 * once, not each time the link is up.
2798 */
2799static void
2800mxge_media_probe(mxge_softc_t *sc)
2801{
2802	mxge_cmd_t cmd;
2803	char *cage_type;
2804	char *ptr;
2805	struct mxge_media_type *mxge_media_types = NULL;
2806	int i, err, ms, mxge_media_type_entries;
2807	uint32_t byte;
2808
2809	sc->need_media_probe = 0;
2810
2811	/* if we've already set a media type, we're done */
2812	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2813		return;
2814
2815	/*
2816	 * parse the product code to deterimine the interface type
2817	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2818	 * after the 3rd dash in the driver's cached copy of the
2819	 * EEPROM's product code string.
2820	 */
2821	ptr = sc->product_code_string;
2822	if (ptr == NULL) {
2823		device_printf(sc->dev, "Missing product code\n");
2824	}
2825
2826	for (i = 0; i < 3; i++, ptr++) {
2827		ptr = index(ptr, '-');
2828		if (ptr == NULL) {
2829			device_printf(sc->dev,
2830				      "only %d dashes in PC?!?\n", i);
2831			return;
2832		}
2833	}
2834	if (*ptr == 'C') {
2835		/* -C is CX4 */
2836		mxge_set_media(sc, IFM_10G_CX4);
2837		return;
2838	}
2839	else if (*ptr == 'Q') {
2840		/* -Q is Quad Ribbon Fiber */
2841		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2842		/* FreeBSD has no media type for Quad ribbon fiber */
2843		return;
2844	}
2845
2846	if (*ptr == 'R') {
2847		/* -R is XFP */
2848		mxge_media_types = mxge_xfp_media_types;
2849		mxge_media_type_entries =
2850			sizeof (mxge_xfp_media_types) /
2851			sizeof (mxge_xfp_media_types[0]);
2852		byte = MXGE_XFP_COMPLIANCE_BYTE;
2853		cage_type = "XFP";
2854	}
2855
2856	if (*ptr == 'S' || *(ptr +1) == 'S') {
2857		/* -S or -2S is SFP+ */
2858		mxge_media_types = mxge_sfp_media_types;
2859		mxge_media_type_entries =
2860			sizeof (mxge_sfp_media_types) /
2861			sizeof (mxge_sfp_media_types[0]);
2862		cage_type = "SFP+";
2863		byte = 3;
2864	}
2865
2866	if (mxge_media_types == NULL) {
2867		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2868		return;
2869	}
2870
2871	/*
2872	 * At this point we know the NIC has an XFP cage, so now we
2873	 * try to determine what is in the cage by using the
2874	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2875	 * register.  We read just one byte, which may take over
2876	 * a millisecond
2877	 */
2878
2879	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2880	cmd.data1 = byte;
2881	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2882	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2883		device_printf(sc->dev, "failed to read XFP\n");
2884	}
2885	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2886		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2887	}
2888	if (err != MXGEFW_CMD_OK) {
2889		return;
2890	}
2891
2892	/* now we wait for the data to be cached */
2893	cmd.data0 = byte;
2894	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2895	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2896		DELAY(1000);
2897		cmd.data0 = byte;
2898		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2899	}
2900	if (err != MXGEFW_CMD_OK) {
2901		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2902			      cage_type, err, ms);
2903		return;
2904	}
2905
2906	if (cmd.data0 == mxge_media_types[0].bitmask) {
2907		if (mxge_verbose)
2908			device_printf(sc->dev, "%s:%s\n", cage_type,
2909				      mxge_media_types[0].name);
2910		mxge_set_media(sc, IFM_10G_CX4);
2911		return;
2912	}
2913	for (i = 1; i < mxge_media_type_entries; i++) {
2914		if (cmd.data0 & mxge_media_types[i].bitmask) {
2915			if (mxge_verbose)
2916				device_printf(sc->dev, "%s:%s\n",
2917					      cage_type,
2918					      mxge_media_types[i].name);
2919
2920			mxge_set_media(sc, mxge_media_types[i].flag);
2921			return;
2922		}
2923	}
2924	device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
2925		      cmd.data0);
2926
2927	return;
2928}
2929
/*
 * Per-slice interrupt handler.  Reaps transmit completions and
 * receive events, and (on the first slice) updates link state and
 * firmware error statistics.  For a legacy (INTx) interrupt it
 * keeps polling until the firmware confirms the line is lowered.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* media type probing is deferred to the watchdog */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
}
3018
/*
 * if_init handler; deliberately a no-op.
 * NOTE(review): interface bring-up appears to be driven from
 * other entry points in this driver -- confirm before relying
 * on this being called.
 */
static void
mxge_init(void *arg)
{
}
3023
3024
3025
/*
 * Release all mbufs and LRO state held by one slice: free the
 * LRO free-list entries, then unload and free every posted
 * receive buffer (big and small rings) and every pending
 * transmit mbuf.
 */
static void
mxge_free_slice_mbufs(struct mxge_slice_state *ss)
{
	struct lro_entry *lro_entry;
	int i;

	while (!SLIST_EMPTY(&ss->lro_free)) {
		lro_entry = SLIST_FIRST(&ss->lro_free);
		SLIST_REMOVE_HEAD(&ss->lro_free, next);
		free(lro_entry, M_DEVBUF);
	}

	for (i = 0; i <= ss->rx_big.mask; i++) {
		if (ss->rx_big.info[i].m == NULL)
			continue;
		bus_dmamap_unload(ss->rx_big.dmat,
				  ss->rx_big.info[i].map);
		m_freem(ss->rx_big.info[i].m);
		ss->rx_big.info[i].m = NULL;
	}

	for (i = 0; i <= ss->rx_small.mask; i++) {
		if (ss->rx_small.info[i].m == NULL)
			continue;
		bus_dmamap_unload(ss->rx_small.dmat,
				  ss->rx_small.info[i].map);
		m_freem(ss->rx_small.info[i].m);
		ss->rx_small.info[i].m = NULL;
	}

	/* transmit ring used only on the first slice */
	if (ss->tx.info == NULL)
		return;

	for (i = 0; i <= ss->tx.mask; i++) {
		ss->tx.info[i].flag = 0;
		if (ss->tx.info[i].m == NULL)
			continue;
		bus_dmamap_unload(ss->tx.dmat,
				  ss->tx.info[i].map);
		m_freem(ss->tx.info[i].m);
		ss->tx.info[i].m = NULL;
	}
}
3070
3071static void
3072mxge_free_mbufs(mxge_softc_t *sc)
3073{
3074	int slice;
3075
3076	for (slice = 0; slice < sc->num_slices; slice++)
3077		mxge_free_slice_mbufs(&sc->ss[slice]);
3078}
3079
/*
 * Tear down one slice's ring resources: the rx_done DMA block,
 * the tx request/segment scratch buffers, the rx shadow rings,
 * and the per-ring busdma maps, tags, and info arrays.  Each
 * pointer is guarded and NULLed so the routine is safe to call
 * on a partially-initialized slice.
 */
static void
mxge_free_slice_rings(struct mxge_slice_state *ss)
{
	int i;


	if (ss->rx_done.entry != NULL)
		mxge_dma_free(&ss->rx_done.dma);
	ss->rx_done.entry = NULL;

	if (ss->tx.req_bytes != NULL)
		free(ss->tx.req_bytes, M_DEVBUF);
	ss->tx.req_bytes = NULL;

	if (ss->tx.seg_list != NULL)
		free(ss->tx.seg_list, M_DEVBUF);
	ss->tx.seg_list = NULL;

	if (ss->rx_small.shadow != NULL)
		free(ss->rx_small.shadow, M_DEVBUF);
	ss->rx_small.shadow = NULL;

	if (ss->rx_big.shadow != NULL)
		free(ss->rx_big.shadow, M_DEVBUF);
	ss->rx_big.shadow = NULL;

	/* destroy per-entry DMA maps before their tag */
	if (ss->tx.info != NULL) {
		if (ss->tx.dmat != NULL) {
			for (i = 0; i <= ss->tx.mask; i++) {
				bus_dmamap_destroy(ss->tx.dmat,
						   ss->tx.info[i].map);
			}
			bus_dma_tag_destroy(ss->tx.dmat);
		}
		free(ss->tx.info, M_DEVBUF);
	}
	ss->tx.info = NULL;

	if (ss->rx_small.info != NULL) {
		if (ss->rx_small.dmat != NULL) {
			for (i = 0; i <= ss->rx_small.mask; i++) {
				bus_dmamap_destroy(ss->rx_small.dmat,
						   ss->rx_small.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_small.dmat,
					   ss->rx_small.extra_map);
			bus_dma_tag_destroy(ss->rx_small.dmat);
		}
		free(ss->rx_small.info, M_DEVBUF);
	}
	ss->rx_small.info = NULL;

	if (ss->rx_big.info != NULL) {
		if (ss->rx_big.dmat != NULL) {
			for (i = 0; i <= ss->rx_big.mask; i++) {
				bus_dmamap_destroy(ss->rx_big.dmat,
						   ss->rx_big.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_big.dmat,
					   ss->rx_big.extra_map);
			bus_dma_tag_destroy(ss->rx_big.dmat);
		}
		free(ss->rx_big.info, M_DEVBUF);
	}
	ss->rx_big.info = NULL;
}
3146
3147static void
3148mxge_free_rings(mxge_softc_t *sc)
3149{
3150	int slice;
3151
3152	for (slice = 0; slice < sc->num_slices; slice++)
3153		mxge_free_slice_rings(&sc->ss[slice]);
3154}
3155
3156static int
3157mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3158		       int tx_ring_entries)
3159{
3160	mxge_softc_t *sc = ss->sc;
3161	size_t bytes;
3162	int err, i;
3163
3164	err = ENOMEM;
3165
3166	/* allocate per-slice receive resources */
3167
3168	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3169	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3170
3171	/* allocate the rx shadow rings */
3172	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3173	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3174	if (ss->rx_small.shadow == NULL)
3175		return err;;
3176
3177	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3178	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3179	if (ss->rx_big.shadow == NULL)
3180		return err;;
3181
3182	/* allocate the rx host info rings */
3183	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3184	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3185	if (ss->rx_small.info == NULL)
3186		return err;;
3187
3188	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3189	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3190	if (ss->rx_big.info == NULL)
3191		return err;;
3192
3193	/* allocate the rx busdma resources */
3194	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3195				 1,			/* alignment */
3196				 4096,			/* boundary */
3197				 BUS_SPACE_MAXADDR,	/* low */
3198				 BUS_SPACE_MAXADDR,	/* high */
3199				 NULL, NULL,		/* filter */
3200				 MHLEN,			/* maxsize */
3201				 1,			/* num segs */
3202				 MHLEN,			/* maxsegsize */
3203				 BUS_DMA_ALLOCNOW,	/* flags */
3204				 NULL, NULL,		/* lock */
3205				 &ss->rx_small.dmat);	/* tag */
3206	if (err != 0) {
3207		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3208			      err);
3209		return err;;
3210	}
3211
3212	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3213				 1,			/* alignment */
3214#if MXGE_VIRT_JUMBOS
3215				 4096,			/* boundary */
3216#else
3217				 0,			/* boundary */
3218#endif
3219				 BUS_SPACE_MAXADDR,	/* low */
3220				 BUS_SPACE_MAXADDR,	/* high */
3221				 NULL, NULL,		/* filter */
3222				 3*4096,		/* maxsize */
3223#if MXGE_VIRT_JUMBOS
3224				 3,			/* num segs */
3225				 4096,			/* maxsegsize*/
3226#else
3227				 1,			/* num segs */
3228				 MJUM9BYTES,		/* maxsegsize*/
3229#endif
3230				 BUS_DMA_ALLOCNOW,	/* flags */
3231				 NULL, NULL,		/* lock */
3232				 &ss->rx_big.dmat);	/* tag */
3233	if (err != 0) {
3234		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3235			      err);
3236		return err;;
3237	}
3238	for (i = 0; i <= ss->rx_small.mask; i++) {
3239		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3240					&ss->rx_small.info[i].map);
3241		if (err != 0) {
3242			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3243				      err);
3244			return err;;
3245		}
3246	}
3247	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3248				&ss->rx_small.extra_map);
3249	if (err != 0) {
3250		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3251			      err);
3252		return err;;
3253	}
3254
3255	for (i = 0; i <= ss->rx_big.mask; i++) {
3256		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3257					&ss->rx_big.info[i].map);
3258		if (err != 0) {
3259			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3260				      err);
3261			return err;;
3262		}
3263	}
3264	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3265				&ss->rx_big.extra_map);
3266	if (err != 0) {
3267		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3268			      err);
3269		return err;;
3270	}
3271
3272	/* now allocate TX resouces */
3273
3274#ifndef IFNET_BUF_RING
3275	/* only use a single TX ring for now */
3276	if (ss != ss->sc->ss)
3277		return 0;
3278#endif
3279
3280	ss->tx.mask = tx_ring_entries - 1;
3281	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3282
3283
3284	/* allocate the tx request copy block */
3285	bytes = 8 +
3286		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3287	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3288	if (ss->tx.req_bytes == NULL)
3289		return err;;
3290	/* ensure req_list entries are aligned to 8 bytes */
3291	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3292		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3293
3294	/* allocate the tx busdma segment list */
3295	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3296	ss->tx.seg_list = (bus_dma_segment_t *)
3297		malloc(bytes, M_DEVBUF, M_WAITOK);
3298	if (ss->tx.seg_list == NULL)
3299		return err;;
3300
3301	/* allocate the tx host info ring */
3302	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3303	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3304	if (ss->tx.info == NULL)
3305		return err;;
3306
3307	/* allocate the tx busdma resources */
3308	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3309				 1,			/* alignment */
3310				 sc->tx_boundary,	/* boundary */
3311				 BUS_SPACE_MAXADDR,	/* low */
3312				 BUS_SPACE_MAXADDR,	/* high */
3313				 NULL, NULL,		/* filter */
3314				 65536 + 256,		/* maxsize */
3315				 ss->tx.max_desc - 2,	/* num segs */
3316				 sc->tx_boundary,	/* maxsegsz */
3317				 BUS_DMA_ALLOCNOW,	/* flags */
3318				 NULL, NULL,		/* lock */
3319				 &ss->tx.dmat);		/* tag */
3320
3321	if (err != 0) {
3322		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3323			      err);
3324		return err;;
3325	}
3326
3327	/* now use these tags to setup dmamaps for each slot
3328	   in the ring */
3329	for (i = 0; i <= ss->tx.mask; i++) {
3330		err = bus_dmamap_create(ss->tx.dmat, 0,
3331					&ss->tx.info[i].map);
3332		if (err != 0) {
3333			device_printf(sc->dev, "Err %d  tx dmamap\n",
3334				      err);
3335			return err;;
3336		}
3337	}
3338	return 0;
3339
3340}
3341
3342static int
3343mxge_alloc_rings(mxge_softc_t *sc)
3344{
3345	mxge_cmd_t cmd;
3346	int tx_ring_size;
3347	int tx_ring_entries, rx_ring_entries;
3348	int err, slice;
3349
3350	/* get ring sizes */
3351	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3352	tx_ring_size = cmd.data0;
3353	if (err != 0) {
3354		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3355		goto abort;
3356	}
3357
3358	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3359	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3360	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3361	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3362	IFQ_SET_READY(&sc->ifp->if_snd);
3363
3364	for (slice = 0; slice < sc->num_slices; slice++) {
3365		err = mxge_alloc_slice_rings(&sc->ss[slice],
3366					     rx_ring_entries,
3367					     tx_ring_entries);
3368		if (err != 0)
3369			goto abort;
3370	}
3371	return 0;
3372
3373abort:
3374	mxge_free_rings(sc);
3375	return err;
3376
3377}
3378
3379
3380static void
3381mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3382{
3383	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3384
3385	if (bufsize < MCLBYTES) {
3386		/* easy, everything fits in a single buffer */
3387		*big_buf_size = MCLBYTES;
3388		*cl_size = MCLBYTES;
3389		*nbufs = 1;
3390		return;
3391	}
3392
3393	if (bufsize < MJUMPAGESIZE) {
3394		/* still easy, everything still fits in a single buffer */
3395		*big_buf_size = MJUMPAGESIZE;
3396		*cl_size = MJUMPAGESIZE;
3397		*nbufs = 1;
3398		return;
3399	}
3400#if MXGE_VIRT_JUMBOS
3401	/* now we need to use virtually contiguous buffers */
3402	*cl_size = MJUM9BYTES;
3403	*big_buf_size = 4096;
3404	*nbufs = mtu / 4096 + 1;
3405	/* needs to be a power of two, so round up */
3406	if (*nbufs == 3)
3407		*nbufs = 4;
3408#else
3409	*cl_size = MJUM9BYTES;
3410	*big_buf_size = MJUM9BYTES;
3411	*nbufs = 1;
3412#endif
3413}
3414
/*
 * Prepare one slice for operation: build its LRO free list, learn the
 * NIC-SRAM addresses of its send/receive rings from the firmware, and
 * pre-fill ("stock") both receive rings with mbufs.
 *
 * nbufs/cl_size come from mxge_choose_params().  Returns 0, or
 * EIO/ENOMEM on failure (caller tears down via its abort path).
 */
static int
mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
{
	mxge_softc_t *sc;
	mxge_cmd_t cmd;
	bus_dmamap_t map;
	struct lro_entry *lro_entry;
	int err, i, slice;


	sc = ss->sc;
	/* slice index is recovered from the pointer offset into sc->ss */
	slice = ss - sc->ss;

	SLIST_INIT(&ss->lro_free);
	SLIST_INIT(&ss->lro_active);

	/* populate the LRO free list; on allocation failure just run
	   with however many entries we managed to get */
	for (i = 0; i < sc->lro_cnt; i++) {
		lro_entry = (struct lro_entry *)
			malloc(sizeof (*lro_entry), M_DEVBUF,
			       M_NOWAIT | M_ZERO);
		if (lro_entry == NULL) {
			sc->lro_cnt = i;
			break;
		}
		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
	}
	/* get the lanai pointers to the send and receive rings */

	err = 0;
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	if (slice == 0) {
#endif
		cmd.data0 = slice;
		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
		/* cmd.data0 now holds the ring's offset within NIC SRAM */
		ss->tx.lanai =
			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
		/* per-slice doorbells live at fixed 64-byte strides */
		ss->tx.send_go = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING
	}
#endif
	/* errors from the three commands are OR-ed and checked once below */
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc,
			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	ss->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		map = ss->rx_small.info[i].map;
		err = mxge_get_buf_small(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, ss->rx_small.mask + 1);
			return ENOMEM;
		}
	}
	/* poison the big-ring shadow addresses before stocking it */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	/* mlen: largest frame (incl. VLAN header and firmware pad) */
	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
	/* each cluster covers nbufs ring slots, so step by nbufs */
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}
3504
/*
 * Bring the interface up: reset the NIC, program the RSS indirection
 * table (multi-slice only), negotiate buffer sizes with the firmware,
 * hand it the stats DMA block, open every slice, and finally start the
 * firmware.  Called with the driver mutex held.  Returns 0 or errno;
 * on failure all slice mbufs are freed.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, big_bytes, nbufs, slice, cl_size, i;
	bus_addr_t bus;
	volatile uint8_t *itable;
	struct mxge_slice_state *ss;

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	if (sc->num_slices > 1) {
		/* setup the indirection table */
		cmd.data0 = sc->num_slices;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
				    &cmd);

		/* GET_RSS_TABLE_OFFSET returns the table's SRAM offset
		   in cmd.data0 */
		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
				     &cmd);
		if (err != 0) {
			device_printf(sc->dev,
				      "failed to setup rss tables\n");
			return err;
		}

		/* just enable an identity mapping */
		itable = sc->sram + cmd.data0;
		for (i = 0; i < sc->num_slices; i++)
			itable[i] = (uint8_t)i;

		/* turn on RSS with the configured hash type */
		cmd.data0 = 1;
		cmd.data1 = mxge_rss_hash_type;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "failed to enable slices\n");
			return err;
		}
	}


	/* pick buffer sizes for the current MTU */
	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);

	cmd.data0 = nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set alway-use-n to %d\n",
			      nbufs);
		return EIO;
	}
	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	for (slice = 0;
#ifdef IFNET_BUF_RING
	     slice < sc->num_slices;
#else
	     slice < 1;
#endif
	     slice++) {
		ss = &sc->ss[slice];
		cmd.data0 =
			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data1 =
			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data2 = sizeof(struct mcp_irq_data);
		/* slice index rides in the upper half of data2 */
		cmd.data2 |= (slice << 16);
		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
	}

	if (err != 0) {
		/* V2 stats unsupported: fall back to the obsolete
		   interface, pointing at just the send_done_count field */
		bus = sc->ss->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* open (stock and wire up) every slice */
	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
		if (err != 0) {
			device_printf(sc->dev, "couldn't open slice %d\n",
				      slice);
			goto abort;
		}
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags |= IFF_DRV_RUNNING;
		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
	}
#endif
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
	/* start the periodic stats/watchdog timer */
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
3652
/*
 * Bring the interface down and free all ring mbufs.  If 'down' is
 * non-zero the caller already knows the NIC is down (e.g. watchdog
 * reset path) and the ETHERNET_DOWN handshake is skipped.  Always
 * returns 0.
 */
static int
mxge_close(mxge_softc_t *sc, int down)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;
#ifdef IFNET_BUF_RING
	struct mxge_slice_state *ss;
	int slice;
#endif

	/* stop the periodic tick first so it cannot race the teardown */
	callout_stop(&sc->co_hdl);
#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
	}
#endif
	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	if (!down) {
		/* down_cnt is bumped by the interrupt path when the
		   firmware acknowledges the down command; snapshot it
		   so we can tell whether the ack arrived */
		old_down_cnt = sc->down_cnt;
		wmb();
		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
		if (err) {
			device_printf(sc->dev,
				      "Couldn't bring down link\n");
		}
		if (old_down_cnt == sc->down_cnt) {
			/* wait for down irq */
			DELAY(10 * sc->intr_coal_delay);
		}
		wmb();
		if (old_down_cnt == sc->down_cnt) {
			device_printf(sc->dev, "never got down irq\n");
		}
	}
	mxge_free_mbufs(sc);

	return 0;
}
3692
3693static void
3694mxge_setup_cfg_space(mxge_softc_t *sc)
3695{
3696	device_t dev = sc->dev;
3697	int reg;
3698	uint16_t cmd, lnk, pectl;
3699
3700	/* find the PCIe link width and set max read request to 4KB*/
3701	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3702		lnk = pci_read_config(dev, reg + 0x12, 2);
3703		sc->link_width = (lnk >> 4) & 0x3f;
3704
3705		pectl = pci_read_config(dev, reg + 0x8, 2);
3706		pectl = (pectl & ~0x7000) | (5 << 12);
3707		pci_write_config(dev, reg + 0x8, pectl, 2);
3708	}
3709
3710	/* Enable DMA and Memory space access */
3711	pci_enable_busmaster(dev);
3712	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3713	cmd |= PCIM_CMD_MEMEN;
3714	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3715}
3716
3717static uint32_t
3718mxge_read_reboot(mxge_softc_t *sc)
3719{
3720	device_t dev = sc->dev;
3721	uint32_t vs;
3722
3723	/* find the vendor specific offset */
3724	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3725		device_printf(sc->dev,
3726			      "could not find vendor specific offset\n");
3727		return (uint32_t)-1;
3728	}
3729	/* enable read32 mode */
3730	pci_write_config(dev, vs + 0x10, 0x3, 1);
3731	/* tell NIC which register to read */
3732	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3733	return (pci_read_config(dev, vs + 0x14, 4));
3734}
3735
/*
 * Recover from a transmit hang detected by mxge_watchdog().  If the
 * NIC rebooted (busmaster bit lost), quiesce transmit, restore PCI
 * config space, reload firmware, and reopen the interface; otherwise
 * just dump the offending slice's ring state.  Returns 0 on a
 * successful recovery, errno otherwise.
 */
static int
mxge_watchdog_reset(mxge_softc_t *sc, int slice)
{
	struct pci_devinfo *dinfo;
	struct mxge_slice_state *ss;
	mxge_tx_ring_t *tx;
	int err, running, s, num_tx_slices = 1;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			return (err);
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
		if (running) {

			/*
			 * quiesce NIC so that TX routines will not try to
			 * xmit after restoration of BAR
			 */

			/* Mark the link as down */
			if (sc->link_state) {
				sc->link_state = 0;
				if_link_state_change(sc->ifp,
						     LINK_STATE_DOWN);
			}
#ifdef IFNET_BUF_RING
			num_tx_slices = sc->num_slices;
#endif
			/* grab all TX locks to ensure no tx  */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
				mtx_lock(&ss->tx.mtx);
			}
			/* down == 1: the NIC is already down, skip the
			   ETHERNET_DOWN handshake */
			mxge_close(sc, 1);
		}
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reload f/w */
		err = mxge_load_firmware(sc, 0);
		if (err) {
			device_printf(sc->dev,
				      "Unable to re-load f/w\n");
		}
		if (running) {
			if (!err)
				err = mxge_open(sc);
			/* release all TX locks */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
				mtx_unlock(&ss->tx.mtx);
			}
		}
		sc->watchdog_resets++;
	} else {
		/* busmaster still set: NIC did not reboot, so the hang
		   is elsewhere; log diagnostics instead of resetting */
		tx = &sc->ss[slice].tx;
		device_printf(sc->dev,
			      "NIC did not reboot, slice %d ring state:\n",
			      slice);
		device_printf(sc->dev,
			      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
			      tx->req, tx->done, tx->queue_active);
		device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
			      tx->activate, tx->deactivate);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      tx->pkt_done,
			      be32toh(sc->ss->fw_stats->send_done_count));
		device_printf(sc->dev, "not resetting\n");
	}
	if (err)
		device_printf(sc->dev, "watchdog reset failed\n");

	return (err);
}
3843
3844static int
3845mxge_watchdog(mxge_softc_t *sc)
3846{
3847	mxge_tx_ring_t *tx;
3848	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3849	int i, err = 0;
3850
3851	/* see if we have outstanding transmits, which
3852	   have been pending for more than mxge_ticks */
3853	for (i = 0;
3854#ifdef IFNET_BUF_RING
3855	     (i < sc->num_slices) && (err == 0);
3856#else
3857	     (i < 1) && (err == 0);
3858#endif
3859	     i++) {
3860		tx = &sc->ss[i].tx;
3861		if (tx->req != tx->done &&
3862		    tx->watchdog_req != tx->watchdog_done &&
3863		    tx->done == tx->watchdog_done) {
3864			/* check for pause blocking before resetting */
3865			if (tx->watchdog_rx_pause == rx_pause)
3866				err = mxge_watchdog_reset(sc, i);
3867			else
3868				device_printf(sc->dev, "Flow control blocking "
3869					      "xmits, check link partner\n");
3870		}
3871
3872		tx->watchdog_req = tx->req;
3873		tx->watchdog_done = tx->done;
3874		tx->watchdog_rx_pause = rx_pause;
3875	}
3876
3877	if (sc->need_media_probe)
3878		mxge_media_probe(sc);
3879	return (err);
3880}
3881
3882static void
3883mxge_update_stats(mxge_softc_t *sc)
3884{
3885	struct mxge_slice_state *ss;
3886	u_long ipackets = 0;
3887	u_long opackets = 0;
3888#ifdef IFNET_BUF_RING
3889	u_long obytes = 0;
3890	u_long omcasts = 0;
3891	u_long odrops = 0;
3892#endif
3893	u_long oerrors = 0;
3894	int slice;
3895
3896	for (slice = 0; slice < sc->num_slices; slice++) {
3897		ss = &sc->ss[slice];
3898		ipackets += ss->ipackets;
3899		opackets += ss->opackets;
3900#ifdef IFNET_BUF_RING
3901		obytes += ss->obytes;
3902		omcasts += ss->omcasts;
3903		odrops += ss->tx.br->br_drops;
3904#endif
3905		oerrors += ss->oerrors;
3906	}
3907	sc->ifp->if_ipackets = ipackets;
3908	sc->ifp->if_opackets = opackets;
3909#ifdef IFNET_BUF_RING
3910	sc->ifp->if_obytes = obytes;
3911	sc->ifp->if_omcasts = omcasts;
3912	sc->ifp->if_snd.ifq_drops = odrops;
3913#endif
3914	sc->ifp->if_oerrors = oerrors;
3915}
3916
3917static void
3918mxge_tick(void *arg)
3919{
3920	mxge_softc_t *sc = arg;
3921	int err = 0;
3922
3923	/* aggregate stats from different slices */
3924	mxge_update_stats(sc);
3925	if (!sc->watchdog_countdown) {
3926		err = mxge_watchdog(sc);
3927		sc->watchdog_countdown = 4;
3928	}
3929	sc->watchdog_countdown--;
3930	if (err == 0)
3931		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3932
3933}
3934
/*
 * ifmedia change callback: manual media selection is not supported,
 * so any attempt to change media is rejected.
 */
static int
mxge_media_change(struct ifnet *ifp)
{

	return (EINVAL);
}
3940
3941static int
3942mxge_change_mtu(mxge_softc_t *sc, int mtu)
3943{
3944	struct ifnet *ifp = sc->ifp;
3945	int real_mtu, old_mtu;
3946	int err = 0;
3947
3948
3949	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3950	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3951		return EINVAL;
3952	mtx_lock(&sc->driver_mtx);
3953	old_mtu = ifp->if_mtu;
3954	ifp->if_mtu = mtu;
3955	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3956		mxge_close(sc, 0);
3957		err = mxge_open(sc);
3958		if (err != 0) {
3959			ifp->if_mtu = old_mtu;
3960			mxge_close(sc, 0);
3961			(void) mxge_open(sc);
3962		}
3963	}
3964	mtx_unlock(&sc->driver_mtx);
3965	return err;
3966}
3967
3968static void
3969mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3970{
3971	mxge_softc_t *sc = ifp->if_softc;
3972
3973
3974	if (sc == NULL)
3975		return;
3976	ifmr->ifm_status = IFM_AVALID;
3977	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3978	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3979	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3980}
3981
/*
 * Interface ioctl handler.  Address/MTU/flags/multicast/capability
 * and media requests are dispatched here; state-changing operations
 * are serialized under the driver mutex.  Returns 0 or errno.
 */
static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		/* address handling is generic; defer to ether layer */
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		/* refuse to touch hardware during detach */
		if (sc->dying) {
			mtx_unlock(&sc->driver_mtx);
			return EINVAL;
		}
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				err = mxge_open(sc);
			} else {
				/* take care of promiscuous and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				mxge_close(sc, 0);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		/* mask holds the capability bits the caller wants toggled */
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				/* TSO depends on tx csum, so it goes too */
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		} else if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask & IFCAP_TSO4) {
			if (IFCAP_TSO4 & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TSO4;
				ifp->if_hwassist &= ~CSUM_TSO;
			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable |= IFCAP_TSO4;
				ifp->if_hwassist |= CSUM_TSO;
			} else {
				/* TSO without tx checksum is not possible */
				printf("mxge requires tx checksum offload"
				       " be enabled to use TSO\n");
				err = EINVAL;
			}
		}
		if (mask & IFCAP_LRO) {
			if (IFCAP_LRO & ifp->if_capenable)
				err = mxge_change_lro_locked(sc, 0);
			else
				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
		}
		if (mask & IFCAP_VLAN_HWTAGGING)
			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
		mtx_unlock(&sc->driver_mtx);
		/* propagate capability changes to vlan interfaces */
		VLAN_CAPABILITIES(ifp);

		break;

	case SIOCGIFMEDIA:
		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
				    &sc->media, command);
		break;

	default:
		err = ENOTTY;
	}
	return err;
}
4088
4089static void
4090mxge_fetch_tunables(mxge_softc_t *sc)
4091{
4092
4093	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4094	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4095			  &mxge_flow_control);
4096	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4097			  &mxge_intr_coal_delay);
4098	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4099			  &mxge_nvidia_ecrc_enable);
4100	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4101			  &mxge_force_firmware);
4102	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4103			  &mxge_deassert_wait);
4104	TUNABLE_INT_FETCH("hw.mxge.verbose",
4105			  &mxge_verbose);
4106	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4107	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4108	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4109	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4110	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4111	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4112	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4113	if (sc->lro_cnt != 0)
4114		mxge_lro_cnt = sc->lro_cnt;
4115
4116	if (bootverbose)
4117		mxge_verbose = 1;
4118	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4119		mxge_intr_coal_delay = 30;
4120	if (mxge_ticks == 0)
4121		mxge_ticks = hz / 2;
4122	sc->pause = mxge_flow_control;
4123	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4124	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4125		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4126	}
4127	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4128	    mxge_initial_mtu < ETHER_MIN_LEN)
4129		mxge_initial_mtu = ETHERMTU_JUMBO;
4130
4131	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4132		mxge_throttle = MXGE_MAX_THROTTLE;
4133	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4134		mxge_throttle = MXGE_MIN_THROTTLE;
4135	sc->throttle = mxge_throttle;
4136}
4137
4138
4139static void
4140mxge_free_slices(mxge_softc_t *sc)
4141{
4142	struct mxge_slice_state *ss;
4143	int i;
4144
4145
4146	if (sc->ss == NULL)
4147		return;
4148
4149	for (i = 0; i < sc->num_slices; i++) {
4150		ss = &sc->ss[i];
4151		if (ss->fw_stats != NULL) {
4152			mxge_dma_free(&ss->fw_stats_dma);
4153			ss->fw_stats = NULL;
4154#ifdef IFNET_BUF_RING
4155			if (ss->tx.br != NULL) {
4156				drbr_free(ss->tx.br, M_DEVBUF);
4157				ss->tx.br = NULL;
4158			}
4159#endif
4160			mtx_destroy(&ss->tx.mtx);
4161		}
4162		if (ss->rx_done.entry != NULL) {
4163			mxge_dma_free(&ss->rx_done.dma);
4164			ss->rx_done.entry = NULL;
4165		}
4166	}
4167	free(sc->ss, M_DEVBUF);
4168	sc->ss = NULL;
4169}
4170
4171static int
4172mxge_alloc_slices(mxge_softc_t *sc)
4173{
4174	mxge_cmd_t cmd;
4175	struct mxge_slice_state *ss;
4176	size_t bytes;
4177	int err, i, max_intr_slots;
4178
4179	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4180	if (err != 0) {
4181		device_printf(sc->dev, "Cannot determine rx ring size\n");
4182		return err;
4183	}
4184	sc->rx_ring_size = cmd.data0;
4185	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4186
4187	bytes = sizeof (*sc->ss) * sc->num_slices;
4188	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4189	if (sc->ss == NULL)
4190		return (ENOMEM);
4191	for (i = 0; i < sc->num_slices; i++) {
4192		ss = &sc->ss[i];
4193
4194		ss->sc = sc;
4195
4196		/* allocate per-slice rx interrupt queues */
4197
4198		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4199		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4200		if (err != 0)
4201			goto abort;
4202		ss->rx_done.entry = ss->rx_done.dma.addr;
4203		bzero(ss->rx_done.entry, bytes);
4204
4205		/*
4206		 * allocate the per-slice firmware stats; stats
4207		 * (including tx) are used used only on the first
4208		 * slice for now
4209		 */
4210#ifndef IFNET_BUF_RING
4211		if (i > 0)
4212			continue;
4213#endif
4214
4215		bytes = sizeof (*ss->fw_stats);
4216		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4217				     sizeof (*ss->fw_stats), 64);
4218		if (err != 0)
4219			goto abort;
4220		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4221		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4222			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4223		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4224#ifdef IFNET_BUF_RING
4225		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4226					   &ss->tx.mtx);
4227#endif
4228	}
4229
4230	return (0);
4231
4232abort:
4233	mxge_free_slices(sc);
4234	return (ENOMEM);
4235}
4236
/*
 * Decide how many slices (RSS queues) to use.  Defaults to 1 and only
 * goes higher when the tunable allows it, the system is SMP, enough
 * MSI-X vectors exist, and the RSS firmware loads and answers the
 * probe commands.  On any failure the original firmware is reloaded
 * and num_slices stays 1.  Result is left in sc->num_slices.
 */
static void
mxge_slice_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *old_fw;
	int msix_cnt, status, max_intr_slots;

	sc->num_slices = 1;
	/*
	 *  don't enable multiple slices if they are not enabled,
	 *  or if this is not an SMP system
	 */

	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
		return;

	/* see how many MSI-X interrupts are available */
	msix_cnt = pci_msix_count(sc->dev);
	if (msix_cnt < 2)
		return;

	/* now load the slice aware firmware see what it supports */
	old_fw = sc->fw_name;
	if (old_fw == mxge_fw_aligned)
		sc->fw_name = mxge_fw_rss_aligned;
	else
		sc->fw_name = mxge_fw_rss_unaligned;
	status = mxge_load_firmware(sc, 0);
	if (status != 0) {
		device_printf(sc->dev, "Falling back to a single slice\n");
		return;
	}

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		goto abort_with_fw;
	}

	/* get rx ring size */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		goto abort_with_fw;
	}
	/* two interrupt-queue slots per receive descriptor */
	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));

	/* tell it the size of the interrupt queues */
	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
		goto abort_with_fw;
	}

	/* ask the maximum number of slices it supports */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
	if (status != 0) {
		device_printf(sc->dev,
			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
		goto abort_with_fw;
	}
	sc->num_slices = cmd.data0;
	/* never use more slices than MSI-X vectors */
	if (sc->num_slices > msix_cnt)
		sc->num_slices = msix_cnt;

	if (mxge_max_slices == -1) {
		/* cap to number of CPUs in system */
		if (sc->num_slices > mp_ncpus)
			sc->num_slices = mp_ncpus;
	} else {
		if (sc->num_slices > mxge_max_slices)
			sc->num_slices = mxge_max_slices;
	}
	/* make sure it is a power of two */
	while (sc->num_slices & (sc->num_slices - 1))
		sc->num_slices--;

	if (mxge_verbose)
		device_printf(sc->dev, "using %d slices\n",
			      sc->num_slices);

	return;

abort_with_fw:
	/* restore the non-RSS firmware and run single-slice */
	sc->fw_name = old_fw;
	(void) mxge_load_firmware(sc, 0);
}
4328
4329static int
4330mxge_add_msix_irqs(mxge_softc_t *sc)
4331{
4332	size_t bytes;
4333	int count, err, i, rid;
4334
4335	rid = PCIR_BAR(2);
4336	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4337						    &rid, RF_ACTIVE);
4338
4339	if (sc->msix_table_res == NULL) {
4340		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4341		return ENXIO;
4342	}
4343
4344	count = sc->num_slices;
4345	err = pci_alloc_msix(sc->dev, &count);
4346	if (err != 0) {
4347		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4348			      "err = %d \n", sc->num_slices, err);
4349		goto abort_with_msix_table;
4350	}
4351	if (count < sc->num_slices) {
4352		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4353			      count, sc->num_slices);
4354		device_printf(sc->dev,
4355			      "Try setting hw.mxge.max_slices to %d\n",
4356			      count);
4357		err = ENOSPC;
4358		goto abort_with_msix;
4359	}
4360	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4361	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4362	if (sc->msix_irq_res == NULL) {
4363		err = ENOMEM;
4364		goto abort_with_msix;
4365	}
4366
4367	for (i = 0; i < sc->num_slices; i++) {
4368		rid = i + 1;
4369		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4370							  SYS_RES_IRQ,
4371							  &rid, RF_ACTIVE);
4372		if (sc->msix_irq_res[i] == NULL) {
4373			device_printf(sc->dev, "couldn't allocate IRQ res"
4374				      " for message %d\n", i);
4375			err = ENXIO;
4376			goto abort_with_res;
4377		}
4378	}
4379
4380	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4381	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4382
4383	for (i = 0; i < sc->num_slices; i++) {
4384		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4385				     INTR_TYPE_NET | INTR_MPSAFE,
4386#if __FreeBSD_version > 700030
4387				     NULL,
4388#endif
4389				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4390		if (err != 0) {
4391			device_printf(sc->dev, "couldn't setup intr for "
4392				      "message %d\n", i);
4393			goto abort_with_intr;
4394		}
4395	}
4396
4397	if (mxge_verbose) {
4398		device_printf(sc->dev, "using %d msix IRQs:",
4399			      sc->num_slices);
4400		for (i = 0; i < sc->num_slices; i++)
4401			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4402		printf("\n");
4403	}
4404	return (0);
4405
4406abort_with_intr:
4407	for (i = 0; i < sc->num_slices; i++) {
4408		if (sc->msix_ih[i] != NULL) {
4409			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4410					  sc->msix_ih[i]);
4411			sc->msix_ih[i] = NULL;
4412		}
4413	}
4414	free(sc->msix_ih, M_DEVBUF);
4415
4416
4417abort_with_res:
4418	for (i = 0; i < sc->num_slices; i++) {
4419		rid = i + 1;
4420		if (sc->msix_irq_res[i] != NULL)
4421			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4422					     sc->msix_irq_res[i]);
4423		sc->msix_irq_res[i] = NULL;
4424	}
4425	free(sc->msix_irq_res, M_DEVBUF);
4426
4427
4428abort_with_msix:
4429	pci_release_msi(sc->dev);
4430
4431abort_with_msix_table:
4432	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4433			     sc->msix_table_res);
4434
4435	return err;
4436}
4437
4438static int
4439mxge_add_single_irq(mxge_softc_t *sc)
4440{
4441	int count, err, rid;
4442
4443	count = pci_msi_count(sc->dev);
4444	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4445		rid = 1;
4446	} else {
4447		rid = 0;
4448		sc->legacy_irq = 1;
4449	}
4450	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4451					 1, RF_SHAREABLE | RF_ACTIVE);
4452	if (sc->irq_res == NULL) {
4453		device_printf(sc->dev, "could not alloc interrupt\n");
4454		return ENXIO;
4455	}
4456	if (mxge_verbose)
4457		device_printf(sc->dev, "using %s irq %ld\n",
4458			      sc->legacy_irq ? "INTx" : "MSI",
4459			      rman_get_start(sc->irq_res));
4460	err = bus_setup_intr(sc->dev, sc->irq_res,
4461			     INTR_TYPE_NET | INTR_MPSAFE,
4462#if __FreeBSD_version > 700030
4463			     NULL,
4464#endif
4465			     mxge_intr, &sc->ss[0], &sc->ih);
4466	if (err != 0) {
4467		bus_release_resource(sc->dev, SYS_RES_IRQ,
4468				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4469		if (!sc->legacy_irq)
4470			pci_release_msi(sc->dev);
4471	}
4472	return err;
4473}
4474
4475static void
4476mxge_rem_msix_irqs(mxge_softc_t *sc)
4477{
4478	int i, rid;
4479
4480	for (i = 0; i < sc->num_slices; i++) {
4481		if (sc->msix_ih[i] != NULL) {
4482			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4483					  sc->msix_ih[i]);
4484			sc->msix_ih[i] = NULL;
4485		}
4486	}
4487	free(sc->msix_ih, M_DEVBUF);
4488
4489	for (i = 0; i < sc->num_slices; i++) {
4490		rid = i + 1;
4491		if (sc->msix_irq_res[i] != NULL)
4492			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4493					     sc->msix_irq_res[i]);
4494		sc->msix_irq_res[i] = NULL;
4495	}
4496	free(sc->msix_irq_res, M_DEVBUF);
4497
4498	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4499			     sc->msix_table_res);
4500
4501	pci_release_msi(sc->dev);
4502	return;
4503}
4504
4505static void
4506mxge_rem_single_irq(mxge_softc_t *sc)
4507{
4508	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4509	bus_release_resource(sc->dev, SYS_RES_IRQ,
4510			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4511	if (!sc->legacy_irq)
4512		pci_release_msi(sc->dev);
4513}
4514
4515static void
4516mxge_rem_irq(mxge_softc_t *sc)
4517{
4518	if (sc->num_slices > 1)
4519		mxge_rem_msix_irqs(sc);
4520	else
4521		mxge_rem_single_irq(sc);
4522}
4523
4524static int
4525mxge_add_irq(mxge_softc_t *sc)
4526{
4527	int err;
4528
4529	if (sc->num_slices > 1)
4530		err = mxge_add_msix_irqs(sc);
4531	else
4532		err = mxge_add_single_irq(sc);
4533
4534	if (0 && err == 0 && sc->num_slices > 1) {
4535		mxge_rem_msix_irqs(sc);
4536		err = mxge_add_msix_irqs(sc);
4537	}
4538	return err;
4539}
4540
4541
4542static int
4543mxge_attach(device_t dev)
4544{
4545	mxge_softc_t *sc = device_get_softc(dev);
4546	struct ifnet *ifp;
4547	int err, rid;
4548
4549	sc->dev = dev;
4550	mxge_fetch_tunables(sc);
4551
4552	err = bus_dma_tag_create(NULL,			/* parent */
4553				 1,			/* alignment */
4554				 0,			/* boundary */
4555				 BUS_SPACE_MAXADDR,	/* low */
4556				 BUS_SPACE_MAXADDR,	/* high */
4557				 NULL, NULL,		/* filter */
4558				 65536 + 256,		/* maxsize */
4559				 MXGE_MAX_SEND_DESC, 	/* num segs */
4560				 65536,			/* maxsegsize */
4561				 0,			/* flags */
4562				 NULL, NULL,		/* lock */
4563				 &sc->parent_dmat);	/* tag */
4564
4565	if (err != 0) {
4566		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4567			      err);
4568		goto abort_with_nothing;
4569	}
4570
4571	ifp = sc->ifp = if_alloc(IFT_ETHER);
4572	if (ifp == NULL) {
4573		device_printf(dev, "can not if_alloc()\n");
4574		err = ENOSPC;
4575		goto abort_with_parent_dmat;
4576	}
4577	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4578
4579	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4580		 device_get_nameunit(dev));
4581	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4582	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4583		 "%s:drv", device_get_nameunit(dev));
4584	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4585		 MTX_NETWORK_LOCK, MTX_DEF);
4586
4587	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4588
4589	mxge_setup_cfg_space(sc);
4590
4591	/* Map the board into the kernel */
4592	rid = PCIR_BARS;
4593	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4594					 ~0, 1, RF_ACTIVE);
4595	if (sc->mem_res == NULL) {
4596		device_printf(dev, "could not map memory\n");
4597		err = ENXIO;
4598		goto abort_with_lock;
4599	}
4600	sc->sram = rman_get_virtual(sc->mem_res);
4601	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4602	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4603		device_printf(dev, "impossible memory region size %ld\n",
4604			      rman_get_size(sc->mem_res));
4605		err = ENXIO;
4606		goto abort_with_mem_res;
4607	}
4608
4609	/* make NULL terminated copy of the EEPROM strings section of
4610	   lanai SRAM */
4611	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4612	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4613				rman_get_bushandle(sc->mem_res),
4614				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4615				sc->eeprom_strings,
4616				MXGE_EEPROM_STRINGS_SIZE - 2);
4617	err = mxge_parse_strings(sc);
4618	if (err != 0)
4619		goto abort_with_mem_res;
4620
4621	/* Enable write combining for efficient use of PCIe bus */
4622	mxge_enable_wc(sc);
4623
4624	/* Allocate the out of band dma memory */
4625	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4626			     sizeof (mxge_cmd_t), 64);
4627	if (err != 0)
4628		goto abort_with_mem_res;
4629	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4630	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4631	if (err != 0)
4632		goto abort_with_cmd_dma;
4633
4634	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4635	if (err != 0)
4636		goto abort_with_zeropad_dma;
4637
4638	/* select & load the firmware */
4639	err = mxge_select_firmware(sc);
4640	if (err != 0)
4641		goto abort_with_dmabench;
4642	sc->intr_coal_delay = mxge_intr_coal_delay;
4643
4644	mxge_slice_probe(sc);
4645	err = mxge_alloc_slices(sc);
4646	if (err != 0)
4647		goto abort_with_dmabench;
4648
4649	err = mxge_reset(sc, 0);
4650	if (err != 0)
4651		goto abort_with_slices;
4652
4653	err = mxge_alloc_rings(sc);
4654	if (err != 0) {
4655		device_printf(sc->dev, "failed to allocate rings\n");
4656		goto abort_with_dmabench;
4657	}
4658
4659	err = mxge_add_irq(sc);
4660	if (err != 0) {
4661		device_printf(sc->dev, "failed to add irq\n");
4662		goto abort_with_rings;
4663	}
4664
4665	ifp->if_baudrate = IF_Gbps(10UL);
4666	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4667		IFCAP_VLAN_MTU;
4668#ifdef INET
4669	ifp->if_capabilities |= IFCAP_LRO;
4670#endif
4671
4672#ifdef MXGE_NEW_VLAN_API
4673	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4674#endif
4675
4676	sc->max_mtu = mxge_max_mtu(sc);
4677	if (sc->max_mtu >= 9000)
4678		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4679	else
4680		device_printf(dev, "MTU limited to %d.  Install "
4681			      "latest firmware for 9000 byte jumbo support\n",
4682			      sc->max_mtu - ETHER_HDR_LEN);
4683	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4684	ifp->if_capenable = ifp->if_capabilities;
4685	if (sc->lro_cnt == 0)
4686		ifp->if_capenable &= ~IFCAP_LRO;
4687	sc->csum_flag = 1;
4688        ifp->if_init = mxge_init;
4689        ifp->if_softc = sc;
4690        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4691        ifp->if_ioctl = mxge_ioctl;
4692        ifp->if_start = mxge_start;
4693	/* Initialise the ifmedia structure */
4694	ifmedia_init(&sc->media, 0, mxge_media_change,
4695		     mxge_media_status);
4696	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4697	mxge_media_probe(sc);
4698	sc->dying = 0;
4699	ether_ifattach(ifp, sc->mac_addr);
4700	/* ether_ifattach sets mtu to ETHERMTU */
4701	if (mxge_initial_mtu != ETHERMTU)
4702		mxge_change_mtu(sc, mxge_initial_mtu);
4703
4704	mxge_add_sysctls(sc);
4705#ifdef IFNET_BUF_RING
4706	ifp->if_transmit = mxge_transmit;
4707	ifp->if_qflush = mxge_qflush;
4708#endif
4709	return 0;
4710
4711abort_with_rings:
4712	mxge_free_rings(sc);
4713abort_with_slices:
4714	mxge_free_slices(sc);
4715abort_with_dmabench:
4716	mxge_dma_free(&sc->dmabench_dma);
4717abort_with_zeropad_dma:
4718	mxge_dma_free(&sc->zeropad_dma);
4719abort_with_cmd_dma:
4720	mxge_dma_free(&sc->cmd_dma);
4721abort_with_mem_res:
4722	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4723abort_with_lock:
4724	pci_disable_busmaster(dev);
4725	mtx_destroy(&sc->cmd_mtx);
4726	mtx_destroy(&sc->driver_mtx);
4727	if_free(ifp);
4728abort_with_parent_dmat:
4729	bus_dma_tag_destroy(sc->parent_dmat);
4730
4731abort_with_nothing:
4732	return err;
4733}
4734
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	/* Refuse to detach while vlan interfaces are stacked on us */
	if (mxge_vlans_active(sc)) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	/* Mark the device dying and bring the interface down under
	   the driver lock */
	mtx_lock(&sc->driver_mtx);
	sc->dying = 1;
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc, 0);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	/* Wait for the tick callout to finish before teardown */
	callout_drain(&sc->co_hdl);
	ifmedia_removeall(&sc->media);
	/* Stop the firmware's dummy RDMA engine */
	mxge_dummy_rdma(sc, 0);
	mxge_rem_sysctls(sc);
	mxge_rem_irq(sc);
	/* Release resources in reverse order of attach */
	mxge_free_rings(sc);
	mxge_free_slices(sc);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
4769
4770static int
4771mxge_shutdown(device_t dev)
4772{
4773	return 0;
4774}
4775
4776/*
4777  This file uses Myri10GE driver indentation.
4778
4779  Local Variables:
4780  c-file-style:"linux"
4781  tab-width:8
4782  End:
4783*/
4784