/* if_mxge.c revision 241037 */
/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 241037 2012-09-28 18:28:27Z glebius $");
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/linker.h>
36#include <sys/firmware.h>
37#include <sys/endian.h>
38#include <sys/sockio.h>
39#include <sys/mbuf.h>
40#include <sys/malloc.h>
41#include <sys/kdb.h>
42#include <sys/kernel.h>
43#include <sys/lock.h>
44#include <sys/module.h>
45#include <sys/socket.h>
46#include <sys/sysctl.h>
47#include <sys/sx.h>
48#include <sys/taskqueue.h>
49
50#include <net/if.h>
51#include <net/if_arp.h>
52#include <net/ethernet.h>
53#include <net/if_dl.h>
54#include <net/if_media.h>
55
56#include <net/bpf.h>
57
58#include <net/if_types.h>
59#include <net/if_vlan_var.h>
60#include <net/zlib.h>
61
62#include <netinet/in_systm.h>
63#include <netinet/in.h>
64#include <netinet/ip.h>
65#include <netinet/tcp.h>
66
67#include <machine/bus.h>
68#include <machine/in_cksum.h>
69#include <machine/resource.h>
70#include <sys/bus.h>
71#include <sys/rman.h>
72#include <sys/smp.h>
73
74#include <dev/pci/pcireg.h>
75#include <dev/pci/pcivar.h>
76#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
77
78#include <vm/vm.h>		/* for pmap_mapdev() */
79#include <vm/pmap.h>
80
81#if defined(__i386) || defined(__amd64)
82#include <machine/specialreg.h>
83#endif
84
85#include <dev/mxge/mxge_mcp.h>
86#include <dev/mxge/mcp_gen_header.h>
87/*#define MXGE_FAKE_IFP*/
88#include <dev/mxge/if_mxge_var.h>
89#ifdef IFNET_BUF_RING
90#include <sys/buf_ring.h>
91#endif
92
93#include "opt_inet.h"
94
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on Nvidia bridges */
static int mxge_force_firmware = 0;	/* 0 = probe; 1 = force aligned fw; >1 = force unaligned */
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;		/* extra diagnostics on the console */
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
/* firmware image names: the "p" variants tolerate unaligned PCIe completions */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);	/* firmware(9) image loading */
MODULE_DEPEND(mxge, zlib, 1, 1, 1);	/* firmware images are deflate-compressed */

/* forward declarations for routines used before their definitions */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
149
150static int
151mxge_probe(device_t dev)
152{
153	int rev;
154
155
156	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
157	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
158	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
159		rev = pci_get_revid(dev);
160		switch (rev) {
161		case MXGE_PCI_REV_Z8E:
162			device_set_desc(dev, "Myri10G-PCIE-8A");
163			break;
164		case MXGE_PCI_REV_Z8ES:
165			device_set_desc(dev, "Myri10G-PCIE-8B");
166			break;
167		default:
168			device_set_desc(dev, "Myri10G-PCIE-8??");
169			device_printf(dev, "Unrecognized rev %d NIC\n",
170				      rev);
171			break;
172		}
173		return 0;
174	}
175	return ENXIO;
176}
177
/*
 * Enable write combining on the device's memory BAR (x86/amd64 only).
 * On failure, sc->wc is cleared and uncached access is used; on other
 * architectures this is a no-op.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	/* optimistically mark WC enabled; cleared below on failure */
	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}
196
197
198/* callback to get our DMA address */
199static void
200mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
201			 int error)
202{
203	if (error == 0) {
204		*(bus_addr_t *) arg = segs->ds_addr;
205	}
206}
207
/*
 * Allocate a coherent, single-segment, zeroed DMA region of 'bytes'
 * bytes with the given alignment, recording its bus address in
 * dma->bus_addr.  A dedicated tag is created so the region can later
 * be torn down with mxge_dma_free().  Returns 0 or a bus_dma error;
 * on failure everything acquired so far is released.
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	/*
	 * A page-aligned allocation larger than a page cannot also
	 * honor a 4KB boundary in one segment, so drop the boundary
	 * restriction; otherwise constrain the segment to one page.
	 */
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; the callback stores the bus address */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
267
268
/*
 * Release a DMA region created by mxge_dma_alloc(): unload the map,
 * free the memory, then destroy the tag (order matters).
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
276
277/*
278 * The eeprom strings on the lanaiX have the format
279 * SN=x\0
280 * MAC=x:x:x:x:x:x\0
281 * PC=text\0
282 */
283
284static int
285mxge_parse_strings(mxge_softc_t *sc)
286{
287#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
288
289	char *ptr, *limit;
290	int i, found_mac;
291
292	ptr = sc->eeprom_strings;
293	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
294	found_mac = 0;
295	while (ptr < limit && *ptr != '\0') {
296		if (memcmp(ptr, "MAC=", 4) == 0) {
297			ptr += 1;
298			sc->mac_addr_string = ptr;
299			for (i = 0; i < 6; i++) {
300				ptr += 3;
301				if ((ptr + 2) > limit)
302					goto abort;
303				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
304				found_mac = 1;
305			}
306		} else if (memcmp(ptr, "PC=", 3) == 0) {
307			ptr += 3;
308			strncpy(sc->product_code_string, ptr,
309				sizeof (sc->product_code_string) - 1);
310		} else if (memcmp(ptr, "SN=", 3) == 0) {
311			ptr += 3;
312			strncpy(sc->serial_number_string, ptr,
313				sizeof (sc->serial_number_string) - 1);
314		}
315		MXGE_NEXT_STRING(ptr);
316	}
317
318	if (found_mac)
319		return 0;
320
321 abort:
322	device_printf(sc->dev, "failed to parse eeprom_strings\n");
323
324	return ENXIO;
325}
326
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Try to enable ECRC generation on an upstream Nvidia (CK804/MCP55)
 * bridge so PCIe completions arrive aligned (see the comment ahead of
 * mxge_firmware_probe()).  The ECRC control bit lives in extended
 * config space at offset 0x178, which this kernel cannot reach via
 * normal config cycles, so the chipset's memory-mapped config window
 * is located and mapped directly.  Best effort: silently gives up if
 * the parent is not a recognized Nvidia bridge.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* the NIC's grandparent is the PCIe bridge of interest */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)	/* Nvidia only */
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* offset of this function's 4KB config page in the window */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	/* ECRC enable bit is 0x40 at extended config offset 0x178 */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
/* non-x86 stub: the Nvidia extended-config hack does not apply here */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
460
461
/*
 * Benchmark NIC DMA reads, writes, and combined read/writes against
 * the dmabench buffer, storing the computed throughput figures in
 * sc->read_dma, sc->write_dma and sc->read_write_dma.  'test_type' is
 * the firmware command to use; for MXGEFW_CMD_UNALIGNED_TEST the
 * firmware aborts on the first unaligned completion, so failure is
 * expected there and not logged as an error.
 */
static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;		/* read test */
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;			/* write test */
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;		/* read+write test */
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
522
523/*
524 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
525 * when the PCI-E Completion packets are aligned on an 8-byte
526 * boundary.  Some PCI-E chip sets always align Completion packets; on
527 * the ones that do not, the alignment can be enforced by enabling
528 * ECRC generation (if supported).
529 *
530 * When PCI-E Completion packets are not aligned, it is actually more
531 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
532 *
533 * If the driver can neither enable ECRC nor verify that it has
534 * already been enabled, then it must use a firmware image which works
535 * around unaligned completion packets (ethp_z8e.dat), and it should
536 * also ensure that it never gives the device a Read-DMA which is
537 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
538 * enabled, then the driver should use the aligned (eth_z8e.dat)
539 * firmware image, and set tx_boundary to 4KB.
540 */
541
542static int
543mxge_firmware_probe(mxge_softc_t *sc)
544{
545	device_t dev = sc->dev;
546	int reg, status;
547	uint16_t pectl;
548
549	sc->tx_boundary = 4096;
550	/*
551	 * Verify the max read request size was set to 4KB
552	 * before trying the test with 4KB.
553	 */
554	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
555		pectl = pci_read_config(dev, reg + 0x8, 2);
556		if ((pectl & (5 << 12)) != (5 << 12)) {
557			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
558				      pectl);
559			sc->tx_boundary = 2048;
560		}
561	}
562
563	/*
564	 * load the optimized firmware (which assumes aligned PCIe
565	 * completions) in order to see if it works on this host.
566	 */
567	sc->fw_name = mxge_fw_aligned;
568	status = mxge_load_firmware(sc, 1);
569	if (status != 0) {
570		return status;
571	}
572
573	/*
574	 * Enable ECRC if possible
575	 */
576	mxge_enable_nvidia_ecrc(sc);
577
578	/*
579	 * Run a DMA test which watches for unaligned completions and
580	 * aborts on the first one seen.
581	 */
582
583	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
584	if (status == 0)
585		return 0; /* keep the aligned firmware */
586
587	if (status != E2BIG)
588		device_printf(dev, "DMA test failed: %d\n", status);
589	if (status == ENOSYS)
590		device_printf(dev, "Falling back to ethp! "
591			      "Please install up to date fw\n");
592	return status;
593}
594
595static int
596mxge_select_firmware(mxge_softc_t *sc)
597{
598	int aligned = 0;
599	int force_firmware = mxge_force_firmware;
600
601	if (sc->throttle)
602		force_firmware = sc->throttle;
603
604	if (force_firmware != 0) {
605		if (force_firmware == 1)
606			aligned = 1;
607		else
608			aligned = 0;
609		if (mxge_verbose)
610			device_printf(sc->dev,
611				      "Assuming %s completions (forced)\n",
612				      aligned ? "aligned" : "unaligned");
613		goto abort;
614	}
615
616	/* if the PCIe link width is 4 or less, we can use the aligned
617	   firmware and skip any checks */
618	if (sc->link_width != 0 && sc->link_width <= 4) {
619		device_printf(sc->dev,
620			      "PCIe x%d Link, expect reduced performance\n",
621			      sc->link_width);
622		aligned = 1;
623		goto abort;
624	}
625
626	if (0 == mxge_firmware_probe(sc))
627		return 0;
628
629abort:
630	if (aligned) {
631		sc->fw_name = mxge_fw_aligned;
632		sc->tx_boundary = 4096;
633	} else {
634		sc->fw_name = mxge_fw_unaligned;
635		sc->tx_boundary = 2048;
636	}
637	return (mxge_load_firmware(sc, 0));
638}
639
/*
 * Union used to strip a const qualifier from a string pointer without
 * a cast warning.  NOTE(review): no use is visible in this portion of
 * the file -- confirm it is still referenced before removing.
 */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
645
646static int
647mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
648{
649
650
651	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
652		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
653			      be32toh(hdr->mcp_type));
654		return EIO;
655	}
656
657	/* save firmware version for sysctl */
658	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
659	if (mxge_verbose)
660		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
661
662	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
663	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
664
665	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
666	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
667		device_printf(sc->dev, "Found firmware version %s\n",
668			      sc->fw_version);
669		device_printf(sc->dev, "Driver needs %d.%d\n",
670			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
671		return EINVAL;
672	}
673	return 0;
674
675}
676
677static void *
678z_alloc(void *nil, u_int items, u_int size)
679{
680        void *ptr;
681
682        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
683        return ptr;
684}
685
/* zlib free hook: release memory obtained via z_alloc() */
static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}
691
692
693static int
694mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
695{
696	z_stream zs;
697	char *inflate_buffer;
698	const struct firmware *fw;
699	const mcp_gen_header_t *hdr;
700	unsigned hdr_offset;
701	int status;
702	unsigned int i;
703	char dummy;
704	size_t fw_len;
705
706	fw = firmware_get(sc->fw_name);
707	if (fw == NULL) {
708		device_printf(sc->dev, "Could not find firmware image %s\n",
709			      sc->fw_name);
710		return ENOENT;
711	}
712
713
714
715	/* setup zlib and decompress f/w */
716	bzero(&zs, sizeof (zs));
717	zs.zalloc = z_alloc;
718	zs.zfree = z_free;
719	status = inflateInit(&zs);
720	if (status != Z_OK) {
721		status = EIO;
722		goto abort_with_fw;
723	}
724
725	/* the uncompressed size is stored as the firmware version,
726	   which would otherwise go unused */
727	fw_len = (size_t) fw->version;
728	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
729	if (inflate_buffer == NULL)
730		goto abort_with_zs;
731	zs.avail_in = fw->datasize;
732	zs.next_in = __DECONST(char *, fw->data);
733	zs.avail_out = fw_len;
734	zs.next_out = inflate_buffer;
735	status = inflate(&zs, Z_FINISH);
736	if (status != Z_STREAM_END) {
737		device_printf(sc->dev, "zlib %d\n", status);
738		status = EIO;
739		goto abort_with_buffer;
740	}
741
742	/* check id */
743	hdr_offset = htobe32(*(const uint32_t *)
744			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
745	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
746		device_printf(sc->dev, "Bad firmware file");
747		status = EIO;
748		goto abort_with_buffer;
749	}
750	hdr = (const void*)(inflate_buffer + hdr_offset);
751
752	status = mxge_validate_firmware(sc, hdr);
753	if (status != 0)
754		goto abort_with_buffer;
755
756	/* Copy the inflated firmware to NIC SRAM. */
757	for (i = 0; i < fw_len; i += 256) {
758		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
759			      inflate_buffer + i,
760			      min(256U, (unsigned)(fw_len - i)));
761		wmb();
762		dummy = *sc->sram;
763		wmb();
764	}
765
766	*limit = fw_len;
767	status = 0;
768abort_with_buffer:
769	free(inflate_buffer, M_TEMP);
770abort_with_zs:
771	inflateEnd(&zs);
772abort_with_fw:
773	firmware_put(fw, FIRMWARE_UNLOAD);
774	return status;
775}
776
777/*
778 * Enable or disable periodic RDMAs from the host to make certain
779 * chipsets resend dropped PCIe messages
780 */
781
/*
 * Ask the firmware to start (enable != 0) or stop issuing periodic
 * dummy RDMAs.  The request is an 8-byte-aligned descriptor PIO'ed to
 * MXGEFW_BOOT_DUMMY_RDMA; completion is detected by the firmware
 * DMA-writing 0xffffffff into the confirmation word, polled for up to
 * ~20ms.
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align the descriptor to an 8-byte boundary within buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll for the firmware's acknowledgement */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
833
834static int
835mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
836{
837	mcp_cmd_t *buf;
838	char buf_bytes[sizeof(*buf) + 8];
839	volatile mcp_cmd_response_t *response = sc->cmd;
840	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
841	uint32_t dma_low, dma_high;
842	int err, sleep_total = 0;
843
844	/* ensure buf is aligned to 8 bytes */
845	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
846
847	buf->data0 = htobe32(data->data0);
848	buf->data1 = htobe32(data->data1);
849	buf->data2 = htobe32(data->data2);
850	buf->cmd = htobe32(cmd);
851	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
852	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
853
854	buf->response_addr.low = htobe32(dma_low);
855	buf->response_addr.high = htobe32(dma_high);
856	mtx_lock(&sc->cmd_mtx);
857	response->result = 0xffffffff;
858	wmb();
859	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
860
861	/* wait up to 20ms */
862	err = EAGAIN;
863	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
864		bus_dmamap_sync(sc->cmd_dma.dmat,
865				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
866		wmb();
867		switch (be32toh(response->result)) {
868		case 0:
869			data->data0 = be32toh(response->data);
870			err = 0;
871			break;
872		case 0xffffffff:
873			DELAY(1000);
874			break;
875		case MXGEFW_CMD_UNKNOWN:
876			err = ENOSYS;
877			break;
878		case MXGEFW_CMD_ERROR_UNALIGNED:
879			err = E2BIG;
880			break;
881		case MXGEFW_CMD_ERROR_BUSY:
882			err = EBUSY;
883			break;
884		case MXGEFW_CMD_ERROR_I2C_ABSENT:
885			err = ENXIO;
886			break;
887		default:
888			device_printf(sc->dev,
889				      "mxge: command %d "
890				      "failed, result = %d\n",
891				      cmd, be32toh(response->result));
892			err = ENXIO;
893			break;
894		}
895		if (err != EAGAIN)
896			break;
897	}
898	if (err == EAGAIN)
899		device_printf(sc->dev, "mxge: command %d timed out"
900			      "result = %d\n",
901			      cmd, be32toh(response->result));
902	mtx_unlock(&sc->cmd_mtx);
903	return err;
904}
905
906static int
907mxge_adopt_running_firmware(mxge_softc_t *sc)
908{
909	struct mcp_gen_header *hdr;
910	const size_t bytes = sizeof (struct mcp_gen_header);
911	size_t hdr_offset;
912	int status;
913
914	/* find running firmware header */
915	hdr_offset = htobe32(*(volatile uint32_t *)
916			     (sc->sram + MCP_HEADER_PTR_OFFSET));
917
918	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
919		device_printf(sc->dev,
920			      "Running firmware has bad header offset (%d)\n",
921			      (int)hdr_offset);
922		return EIO;
923	}
924
925	/* copy header of running firmware from SRAM to host memory to
926	 * validate firmware */
927	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
928	if (hdr == NULL) {
929		device_printf(sc->dev, "could not malloc firmware hdr\n");
930		return ENOMEM;
931	}
932	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
933				rman_get_bushandle(sc->mem_res),
934				hdr_offset, (char *)hdr, bytes);
935	status = mxge_validate_firmware(sc, hdr);
936	free(hdr, M_DEVBUF);
937
938	/*
939	 * check to see if adopted firmware has bug where adopting
940	 * it will cause broadcasts to be filtered unless the NIC
941	 * is kept in ALLMULTI mode
942	 */
943	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
944	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
945		sc->adopted_rx_filter_bug = 1;
946		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
947			      "working around rx filter bug\n",
948			      sc->fw_ver_major, sc->fw_ver_minor,
949			      sc->fw_ver_tiny);
950	}
951
952	return status;
953}
954
955
/*
 * Load firmware into NIC SRAM and hand it off to the bootstrap MCP.
 * If the helper fails and 'adopt' is non-zero, fall back to
 * validating and adopting the firmware already running on the NIC
 * (which restricts tx_boundary to 2KB).  Handoff completion is
 * signalled by the firmware DMA-writing 0xffffffff to the
 * confirmation word.  Returns 0 on success or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align the handoff descriptor to an 8-byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		/* adopted firmware is treated as unaligned-safe only */
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll (up to ~200ms) for the firmware's acknowledgement */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
1040
1041static int
1042mxge_update_mac_address(mxge_softc_t *sc)
1043{
1044	mxge_cmd_t cmd;
1045	uint8_t *addr = sc->mac_addr;
1046	int status;
1047
1048
1049	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1050		     | (addr[2] << 8) | addr[3]);
1051
1052	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1053
1054	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1055	return status;
1056}
1057
1058static int
1059mxge_change_pause(mxge_softc_t *sc, int pause)
1060{
1061	mxge_cmd_t cmd;
1062	int status;
1063
1064	if (pause)
1065		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1066				       &cmd);
1067	else
1068		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1069				       &cmd);
1070
1071	if (status) {
1072		device_printf(sc->dev, "Failed to set flow control mode\n");
1073		return ENXIO;
1074	}
1075	sc->pause = pause;
1076	return 0;
1077}
1078
1079static void
1080mxge_change_promisc(mxge_softc_t *sc, int promisc)
1081{
1082	mxge_cmd_t cmd;
1083	int status;
1084
1085	if (mxge_always_promisc)
1086		promisc = 1;
1087
1088	if (promisc)
1089		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1090				       &cmd);
1091	else
1092		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1093				       &cmd);
1094
1095	if (status) {
1096		device_printf(sc->dev, "Failed to set promisc mode\n");
1097	}
1098}
1099
/*
 * Reprogram the firmware's multicast filters from the interface's
 * address list.  Filtering is first disabled (ALLMULTI) while the
 * list is rewritten, and is left disabled if IFF_ALLMULTI was
 * requested, if the adopted firmware has the rx filter bug, or if any
 * firmware command fails.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	/* buggy adopted firmware must stay in ALLMULTI (see
	 * mxge_adopt_running_firmware) */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte link address across data0/data1 */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
1167
1168static int
1169mxge_max_mtu(mxge_softc_t *sc)
1170{
1171	mxge_cmd_t cmd;
1172	int status;
1173
1174	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1175		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1176
1177	/* try to set nbufs to see if it we can
1178	   use virtually contiguous jumbos */
1179	cmd.data0 = 0;
1180	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1181			       &cmd);
1182	if (status == 0)
1183		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1184
1185	/* otherwise, we're limited to MJUMPAGESIZE */
1186	return MJUMPAGESIZE - MXGEFW_PAD;
1187}
1188
/*
 * Reset the NIC firmware and re-establish all driver/firmware shared
 * state: interrupt queue DMA, interrupt coalescing/ack/deassert
 * pointers in NIC sram, per-slice counters, MAC address, promiscuous
 * and pause settings, multicast filters, and (optionally) throttle.
 * Several command errors are accumulated into 'status' via |= and
 * checked in bulk.  Returns 0 on success or a firmware error code.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	/* re-enable the write-combined doorbell/rdma machinery */
	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			/* errors accumulate; checked below in bulk */
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	/* the next three commands return sram offsets in cmd.data0;
	   the driver caches pointers into the NIC's sram window */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	/* the coalescing delay lives in NIC sram, big-endian */
	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		/* each slice's irq-claim register is two words wide */
		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	/* restore the host-visible settings the reset wiped out */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}
1332
1333static int
1334mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1335{
1336	mxge_cmd_t cmd;
1337	mxge_softc_t *sc;
1338	int err;
1339	unsigned int throttle;
1340
1341	sc = arg1;
1342	throttle = sc->throttle;
1343	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1344        if (err != 0) {
1345                return err;
1346        }
1347
1348	if (throttle == sc->throttle)
1349		return 0;
1350
1351        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1352                return EINVAL;
1353
1354	mtx_lock(&sc->driver_mtx);
1355	cmd.data0 = throttle;
1356	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1357	if (err == 0)
1358		sc->throttle = throttle;
1359	mtx_unlock(&sc->driver_mtx);
1360	return err;
1361}
1362
1363static int
1364mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1365{
1366        mxge_softc_t *sc;
1367        unsigned int intr_coal_delay;
1368        int err;
1369
1370        sc = arg1;
1371        intr_coal_delay = sc->intr_coal_delay;
1372        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1373        if (err != 0) {
1374                return err;
1375        }
1376        if (intr_coal_delay == sc->intr_coal_delay)
1377                return 0;
1378
1379        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1380                return EINVAL;
1381
1382	mtx_lock(&sc->driver_mtx);
1383	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1384	sc->intr_coal_delay = intr_coal_delay;
1385
1386	mtx_unlock(&sc->driver_mtx);
1387        return err;
1388}
1389
1390static int
1391mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1392{
1393        mxge_softc_t *sc;
1394        unsigned int enabled;
1395        int err;
1396
1397        sc = arg1;
1398        enabled = sc->pause;
1399        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1400        if (err != 0) {
1401                return err;
1402        }
1403        if (enabled == sc->pause)
1404                return 0;
1405
1406	mtx_lock(&sc->driver_mtx);
1407	err = mxge_change_pause(sc, enabled);
1408	mtx_unlock(&sc->driver_mtx);
1409        return err;
1410}
1411
1412static int
1413mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1414{
1415	struct ifnet *ifp;
1416	int err = 0;
1417
1418	ifp = sc->ifp;
1419	if (lro_cnt == 0)
1420		ifp->if_capenable &= ~IFCAP_LRO;
1421	else
1422		ifp->if_capenable |= IFCAP_LRO;
1423	sc->lro_cnt = lro_cnt;
1424	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1425		mxge_close(sc, 0);
1426		err = mxge_open(sc);
1427	}
1428	return err;
1429}
1430
1431static int
1432mxge_change_lro(SYSCTL_HANDLER_ARGS)
1433{
1434	mxge_softc_t *sc;
1435	unsigned int lro_cnt;
1436	int err;
1437
1438	sc = arg1;
1439	lro_cnt = sc->lro_cnt;
1440	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1441	if (err != 0)
1442		return err;
1443
1444	if (lro_cnt == sc->lro_cnt)
1445		return 0;
1446
1447	if (lro_cnt > 128)
1448		return EINVAL;
1449
1450	mtx_lock(&sc->driver_mtx);
1451	err = mxge_change_lro_locked(sc, lro_cnt);
1452	mtx_unlock(&sc->driver_mtx);
1453	return err;
1454}
1455
1456static int
1457mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1458{
1459        int err;
1460
1461        if (arg1 == NULL)
1462                return EFAULT;
1463        arg2 = be32toh(*(int *)arg1);
1464        arg1 = NULL;
1465        err = sysctl_handle_int(oidp, arg1, arg2, req);
1466
1467        return err;
1468}
1469
1470static void
1471mxge_rem_sysctls(mxge_softc_t *sc)
1472{
1473	struct mxge_slice_state *ss;
1474	int slice;
1475
1476	if (sc->slice_sysctl_tree == NULL)
1477		return;
1478
1479	for (slice = 0; slice < sc->num_slices; slice++) {
1480		ss = &sc->ss[slice];
1481		if (ss == NULL || ss->sysctl_tree == NULL)
1482			continue;
1483		sysctl_ctx_free(&ss->sysctl_ctx);
1484		ss->sysctl_tree = NULL;
1485	}
1486	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1487	sc->slice_sysctl_tree = NULL;
1488}
1489
1490static void
1491mxge_add_sysctls(mxge_softc_t *sc)
1492{
1493	struct sysctl_ctx_list *ctx;
1494	struct sysctl_oid_list *children;
1495	mcp_irq_data_t *fw;
1496	struct mxge_slice_state *ss;
1497	int slice;
1498	char slice_num[8];
1499
1500	ctx = device_get_sysctl_ctx(sc->dev);
1501	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1502	fw = sc->ss[0].fw_stats;
1503
1504	/* random information */
1505	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1506		       "firmware_version",
1507		       CTLFLAG_RD, &sc->fw_version,
1508		       0, "firmware version");
1509	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1510		       "serial_number",
1511		       CTLFLAG_RD, &sc->serial_number_string,
1512		       0, "serial number");
1513	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1514		       "product_code",
1515		       CTLFLAG_RD, &sc->product_code_string,
1516		       0, "product_code");
1517	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1518		       "pcie_link_width",
1519		       CTLFLAG_RD, &sc->link_width,
1520		       0, "tx_boundary");
1521	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1522		       "tx_boundary",
1523		       CTLFLAG_RD, &sc->tx_boundary,
1524		       0, "tx_boundary");
1525	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1526		       "write_combine",
1527		       CTLFLAG_RD, &sc->wc,
1528		       0, "write combining PIO?");
1529	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1530		       "read_dma_MBs",
1531		       CTLFLAG_RD, &sc->read_dma,
1532		       0, "DMA Read speed in MB/s");
1533	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1534		       "write_dma_MBs",
1535		       CTLFLAG_RD, &sc->write_dma,
1536		       0, "DMA Write speed in MB/s");
1537	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1538		       "read_write_dma_MBs",
1539		       CTLFLAG_RD, &sc->read_write_dma,
1540		       0, "DMA concurrent Read/Write speed in MB/s");
1541	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1542		       "watchdog_resets",
1543		       CTLFLAG_RD, &sc->watchdog_resets,
1544		       0, "Number of times NIC was reset");
1545
1546
1547	/* performance related tunables */
1548	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549			"intr_coal_delay",
1550			CTLTYPE_INT|CTLFLAG_RW, sc,
1551			0, mxge_change_intr_coal,
1552			"I", "interrupt coalescing delay in usecs");
1553
1554	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555			"throttle",
1556			CTLTYPE_INT|CTLFLAG_RW, sc,
1557			0, mxge_change_throttle,
1558			"I", "transmit throttling");
1559
1560	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561			"flow_control_enabled",
1562			CTLTYPE_INT|CTLFLAG_RW, sc,
1563			0, mxge_change_flow_control,
1564			"I", "interrupt coalescing delay in usecs");
1565
1566	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1567		       "deassert_wait",
1568		       CTLFLAG_RW, &mxge_deassert_wait,
1569		       0, "Wait for IRQ line to go low in ihandler");
1570
1571	/* stats block from firmware is in network byte order.
1572	   Need to swap it */
1573	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1574			"link_up",
1575			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1576			0, mxge_handle_be32,
1577			"I", "link up");
1578	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579			"rdma_tags_available",
1580			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1581			0, mxge_handle_be32,
1582			"I", "rdma_tags_available");
1583	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1584			"dropped_bad_crc32",
1585			CTLTYPE_INT|CTLFLAG_RD,
1586			&fw->dropped_bad_crc32,
1587			0, mxge_handle_be32,
1588			"I", "dropped_bad_crc32");
1589	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1590			"dropped_bad_phy",
1591			CTLTYPE_INT|CTLFLAG_RD,
1592			&fw->dropped_bad_phy,
1593			0, mxge_handle_be32,
1594			"I", "dropped_bad_phy");
1595	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1596			"dropped_link_error_or_filtered",
1597			CTLTYPE_INT|CTLFLAG_RD,
1598			&fw->dropped_link_error_or_filtered,
1599			0, mxge_handle_be32,
1600			"I", "dropped_link_error_or_filtered");
1601	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1602			"dropped_link_overflow",
1603			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1604			0, mxge_handle_be32,
1605			"I", "dropped_link_overflow");
1606	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1607			"dropped_multicast_filtered",
1608			CTLTYPE_INT|CTLFLAG_RD,
1609			&fw->dropped_multicast_filtered,
1610			0, mxge_handle_be32,
1611			"I", "dropped_multicast_filtered");
1612	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1613			"dropped_no_big_buffer",
1614			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1615			0, mxge_handle_be32,
1616			"I", "dropped_no_big_buffer");
1617	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1618			"dropped_no_small_buffer",
1619			CTLTYPE_INT|CTLFLAG_RD,
1620			&fw->dropped_no_small_buffer,
1621			0, mxge_handle_be32,
1622			"I", "dropped_no_small_buffer");
1623	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1624			"dropped_overrun",
1625			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1626			0, mxge_handle_be32,
1627			"I", "dropped_overrun");
1628	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1629			"dropped_pause",
1630			CTLTYPE_INT|CTLFLAG_RD,
1631			&fw->dropped_pause,
1632			0, mxge_handle_be32,
1633			"I", "dropped_pause");
1634	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1635			"dropped_runt",
1636			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1637			0, mxge_handle_be32,
1638			"I", "dropped_runt");
1639
1640	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1641			"dropped_unicast_filtered",
1642			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1643			0, mxge_handle_be32,
1644			"I", "dropped_unicast_filtered");
1645
1646	/* verbose printing? */
1647	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1648		       "verbose",
1649		       CTLFLAG_RW, &mxge_verbose,
1650		       0, "verbose printing");
1651
1652	/* lro */
1653	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1654			"lro_cnt",
1655			CTLTYPE_INT|CTLFLAG_RW, sc,
1656			0, mxge_change_lro,
1657			"I", "number of lro merge queues");
1658
1659
1660	/* add counters exported for debugging from all slices */
1661	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1662	sc->slice_sysctl_tree =
1663		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1664				"slice", CTLFLAG_RD, 0, "");
1665
1666	for (slice = 0; slice < sc->num_slices; slice++) {
1667		ss = &sc->ss[slice];
1668		sysctl_ctx_init(&ss->sysctl_ctx);
1669		ctx = &ss->sysctl_ctx;
1670		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1671		sprintf(slice_num, "%d", slice);
1672		ss->sysctl_tree =
1673			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1674					CTLFLAG_RD, 0, "");
1675		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1676		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677			       "rx_small_cnt",
1678			       CTLFLAG_RD, &ss->rx_small.cnt,
1679			       0, "rx_small_cnt");
1680		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1681			       "rx_big_cnt",
1682			       CTLFLAG_RD, &ss->rx_big.cnt,
1683			       0, "rx_small_cnt");
1684		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1686			       0, "number of lro merge queues flushed");
1687
1688		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1689			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1690			       0, "number of frames appended to lro merge"
1691			       "queues");
1692
1693#ifndef IFNET_BUF_RING
1694		/* only transmit from slice 0 for now */
1695		if (slice > 0)
1696			continue;
1697#endif
1698		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1699			       "tx_req",
1700			       CTLFLAG_RD, &ss->tx.req,
1701			       0, "tx_req");
1702
1703		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1704			       "tx_done",
1705			       CTLFLAG_RD, &ss->tx.done,
1706			       0, "tx_done");
1707		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1708			       "tx_pkt_done",
1709			       CTLFLAG_RD, &ss->tx.pkt_done,
1710			       0, "tx_done");
1711		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1712			       "tx_stall",
1713			       CTLFLAG_RD, &ss->tx.stall,
1714			       0, "tx_stall");
1715		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1716			       "tx_wake",
1717			       CTLFLAG_RD, &ss->tx.wake,
1718			       0, "tx_wake");
1719		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1720			       "tx_defrag",
1721			       CTLFLAG_RD, &ss->tx.defrag,
1722			       0, "tx_defrag");
1723		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1724			       "tx_queue_active",
1725			       CTLFLAG_RD, &ss->tx.queue_active,
1726			       0, "tx_queue_active");
1727		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1728			       "tx_activate",
1729			       CTLFLAG_RD, &ss->tx.activate,
1730			       0, "tx_activate");
1731		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1732			       "tx_deactivate",
1733			       CTLFLAG_RD, &ss->tx.deactivate,
1734			       0, "tx_deactivate");
1735	}
1736}
1737
1738/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1739   backwards one at a time and handle ring wraps */
1740
static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        /* Copy descriptors cnt-1 down to 1 into the NIC's lanai
	   window, deliberately leaving src[0] for the caller:
	   mxge_submit_req() writes the first slot itself so that its
	   valid flag can be set last. */
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
		/* mask the slot index to handle ring wrap */
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		/* flush each PIO descriptor write before the next */
                wmb();
        }
}
1755
1756/*
1757 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1758 * at most 32 bytes at a time, so as to avoid involving the software
1759 * pio handler in the nic.   We re-write the first segment's flags
1760 * to mark them valid only after writing the entire chain
1761 */
1762
static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* Clear the first descriptor's flags (which contain the valid
	   bits) before copying, so the NIC cannot start processing
	   the chain until the flags are re-written at the end. */
	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: stream descriptor pairs forward */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        wmb();
}
1811
1812#if IFCAP_TSO4
1813
1814static void
1815mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1816	       int busdma_seg_cnt, int ip_off)
1817{
1818	mxge_tx_ring_t *tx;
1819	mcp_kreq_ether_send_t *req;
1820	bus_dma_segment_t *seg;
1821	struct ip *ip;
1822	struct tcphdr *tcp;
1823	uint32_t low, high_swapped;
1824	int len, seglen, cum_len, cum_len_next;
1825	int next_is_first, chop, cnt, rdma_count, small;
1826	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1827	uint8_t flags, flags_next;
1828	static int once;
1829
1830	mss = m->m_pkthdr.tso_segsz;
1831
1832	/* negative cum_len signifies to the
1833	 * send loop that we are still in the
1834	 * header portion of the TSO packet.
1835	 */
1836
1837	/* ensure we have the ethernet, IP and TCP
1838	   header together in the first mbuf, copy
1839	   it to a scratch buffer if not */
1840	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1841		m_copydata(m, 0, ip_off + sizeof (*ip),
1842			   ss->scratch);
1843		ip = (struct ip *)(ss->scratch + ip_off);
1844	} else {
1845		ip = (struct ip *)(mtod(m, char *) + ip_off);
1846	}
1847	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1848			    + sizeof (*tcp))) {
1849		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1850			   + sizeof (*tcp),  ss->scratch);
1851		ip = (struct ip *)(mtod(m, char *) + ip_off);
1852	}
1853
1854	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1855	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1856	cksum_offset = ip_off + (ip->ip_hl << 2);
1857
1858	/* TSO implies checksum offload on this hardware */
1859	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP)) == 0)) {
1860		/*
1861		 * If packet has full TCP csum, replace it with pseudo hdr
1862		 * sum that the NIC expects, otherwise the NIC will emit
1863		 * packets with bad TCP checksums.
1864		 */
1865		m->m_pkthdr.csum_flags = CSUM_TCP;
1866		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1867		tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1868			htons(IPPROTO_TCP + (m->m_pkthdr.len - cksum_offset)));
1869	}
1870	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1871
1872
1873	/* for TSO, pseudo_hdr_offset holds mss.
1874	 * The firmware figures out where to put
1875	 * the checksum by parsing the header. */
1876	pseudo_hdr_offset = htobe16(mss);
1877
1878	tx = &ss->tx;
1879	req = tx->req_list;
1880	seg = tx->seg_list;
1881	cnt = 0;
1882	rdma_count = 0;
1883	/* "rdma_count" is the number of RDMAs belonging to the
1884	 * current packet BEFORE the current send request. For
1885	 * non-TSO packets, this is equal to "count".
1886	 * For TSO packets, rdma_count needs to be reset
1887	 * to 0 after a segment cut.
1888	 *
1889	 * The rdma_count field of the send request is
1890	 * the number of RDMAs of the packet starting at
1891	 * that request. For TSO send requests with one ore more cuts
1892	 * in the middle, this is the number of RDMAs starting
1893	 * after the last cut in the request. All previous
1894	 * segments before the last cut implicitly have 1 RDMA.
1895	 *
1896	 * Since the number of RDMAs is not known beforehand,
1897	 * it must be filled-in retroactively - after each
1898	 * segmentation cut or at the end of the entire packet.
1899	 */
1900
1901	while (busdma_seg_cnt) {
1902		/* Break the busdma segment up into pieces*/
1903		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1904		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1905		len = seg->ds_len;
1906
1907		while (len) {
1908			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1909			seglen = len;
1910			cum_len_next = cum_len + seglen;
1911			(req-rdma_count)->rdma_count = rdma_count + 1;
1912			if (__predict_true(cum_len >= 0)) {
1913				/* payload */
1914				chop = (cum_len_next > mss);
1915				cum_len_next = cum_len_next % mss;
1916				next_is_first = (cum_len_next == 0);
1917				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1918				flags_next |= next_is_first *
1919					MXGEFW_FLAGS_FIRST;
1920				rdma_count |= -(chop | next_is_first);
1921				rdma_count += chop & !next_is_first;
1922			} else if (cum_len_next >= 0) {
1923				/* header ends */
1924				rdma_count = -1;
1925				cum_len_next = 0;
1926				seglen = -cum_len;
1927				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1928				flags_next = MXGEFW_FLAGS_TSO_PLD |
1929					MXGEFW_FLAGS_FIRST |
1930					(small * MXGEFW_FLAGS_SMALL);
1931			    }
1932
1933			req->addr_high = high_swapped;
1934			req->addr_low = htobe32(low);
1935			req->pseudo_hdr_offset = pseudo_hdr_offset;
1936			req->pad = 0;
1937			req->rdma_count = 1;
1938			req->length = htobe16(seglen);
1939			req->cksum_offset = cksum_offset;
1940			req->flags = flags | ((cum_len & 1) *
1941					      MXGEFW_FLAGS_ALIGN_ODD);
1942			low += seglen;
1943			len -= seglen;
1944			cum_len = cum_len_next;
1945			flags = flags_next;
1946			req++;
1947			cnt++;
1948			rdma_count++;
1949			if (__predict_false(cksum_offset > seglen))
1950				cksum_offset -= seglen;
1951			else
1952				cksum_offset = 0;
1953			if (__predict_false(cnt > tx->max_desc))
1954				goto drop;
1955		}
1956		busdma_seg_cnt--;
1957		seg++;
1958	}
1959	(req-rdma_count)->rdma_count = rdma_count;
1960
1961	do {
1962		req--;
1963		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1964	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1965
1966	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1967	mxge_submit_req(tx, tx->req_list, cnt);
1968#ifdef IFNET_BUF_RING
1969	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1970		/* tell the NIC to start polling this slice */
1971		*tx->send_go = 1;
1972		tx->queue_active = 1;
1973		tx->activate++;
1974		wmb();
1975	}
1976#endif
1977	return;
1978
1979drop:
1980	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1981	m_freem(m);
1982	ss->oerrors++;
1983	if (!once) {
1984		printf("tx->max_desc exceeded via TSO!\n");
1985		printf("mss = %d, %ld, %d!\n", mss,
1986		       (long)seg - (long)tx->seg_list, tx->max_desc);
1987		once = 1;
1988	}
1989	return;
1990
1991}
1992
1993#endif /* IFCAP_TSO4 */
1994
1995#ifdef MXGE_NEW_VLAN_API
1996/*
1997 * We reproduce the software vlan tag insertion from
1998 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1999 * vlan tag insertion. We need to advertise this in order to have the
2000 * vlan interface respect our csum offload flags.
2001 */
static struct mbuf *
mxge_vlan_tag_insert(struct mbuf *m)
{
	struct ether_vlan_header *evl;

	/* prepend room for the 4-byte 802.1Q encapsulation;
	   M_PREPEND sets m to NULL on allocation failure */
	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
	if (__predict_false(m == NULL))
		return NULL;
	/* the whole VLAN header must be contiguous */
	if (m->m_len < sizeof(*evl)) {
		m = m_pullup(m, sizeof(*evl));
		if (__predict_false(m == NULL))
			return NULL;
	}
	/*
	 * Transform the Ethernet header into an Ethernet header
	 * with 802.1Q encapsulation.
	 */
	evl = mtod(m, struct ether_vlan_header *);
	/* slide the dst/src MAC addresses forward over the new bytes */
	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
	/* the tag is now inline; clear the out-of-band flag */
	m->m_flags &= ~M_VLANTAG;
	return m;
}
2027#endif /* MXGE_NEW_VLAN_API */
2028
2029static void
2030mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2031{
2032	mxge_softc_t *sc;
2033	mcp_kreq_ether_send_t *req;
2034	bus_dma_segment_t *seg;
2035	struct mbuf *m_tmp;
2036	struct ifnet *ifp;
2037	mxge_tx_ring_t *tx;
2038	struct ip *ip;
2039	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2040	uint16_t pseudo_hdr_offset;
2041        uint8_t flags, cksum_offset;
2042
2043
2044	sc = ss->sc;
2045	ifp = sc->ifp;
2046	tx = &ss->tx;
2047
2048	ip_off = sizeof (struct ether_header);
2049#ifdef MXGE_NEW_VLAN_API
2050	if (m->m_flags & M_VLANTAG) {
2051		m = mxge_vlan_tag_insert(m);
2052		if (__predict_false(m == NULL))
2053			goto drop;
2054		ip_off += ETHER_VLAN_ENCAP_LEN;
2055	}
2056#endif
2057	/* (try to) map the frame for DMA */
2058	idx = tx->req & tx->mask;
2059	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2060				      m, tx->seg_list, &cnt,
2061				      BUS_DMA_NOWAIT);
2062	if (__predict_false(err == EFBIG)) {
2063		/* Too many segments in the chain.  Try
2064		   to defrag */
2065		m_tmp = m_defrag(m, M_NOWAIT);
2066		if (m_tmp == NULL) {
2067			goto drop;
2068		}
2069		ss->tx.defrag++;
2070		m = m_tmp;
2071		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2072					      tx->info[idx].map,
2073					      m, tx->seg_list, &cnt,
2074					      BUS_DMA_NOWAIT);
2075	}
2076	if (__predict_false(err != 0)) {
2077		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2078			      " packet len = %d\n", err, m->m_pkthdr.len);
2079		goto drop;
2080	}
2081	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2082			BUS_DMASYNC_PREWRITE);
2083	tx->info[idx].m = m;
2084
2085#if IFCAP_TSO4
2086	/* TSO is different enough, we handle it in another routine */
2087	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2088		mxge_encap_tso(ss, m, cnt, ip_off);
2089		return;
2090	}
2091#endif
2092
2093	req = tx->req_list;
2094	cksum_offset = 0;
2095	pseudo_hdr_offset = 0;
2096	flags = MXGEFW_FLAGS_NO_TSO;
2097
2098	/* checksum offloading? */
2099	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2100		/* ensure ip header is in first mbuf, copy
2101		   it to a scratch buffer if not */
2102		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2103			m_copydata(m, 0, ip_off + sizeof (*ip),
2104				   ss->scratch);
2105			ip = (struct ip *)(ss->scratch + ip_off);
2106		} else {
2107			ip = (struct ip *)(mtod(m, char *) + ip_off);
2108		}
2109		cksum_offset = ip_off + (ip->ip_hl << 2);
2110		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2111		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2112		req->cksum_offset = cksum_offset;
2113		flags |= MXGEFW_FLAGS_CKSUM;
2114		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2115	} else {
2116		odd_flag = 0;
2117	}
2118	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2119		flags |= MXGEFW_FLAGS_SMALL;
2120
2121	/* convert segments into a request list */
2122	cum_len = 0;
2123	seg = tx->seg_list;
2124	req->flags = MXGEFW_FLAGS_FIRST;
2125	for (i = 0; i < cnt; i++) {
2126		req->addr_low =
2127			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2128		req->addr_high =
2129			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2130		req->length = htobe16(seg->ds_len);
2131		req->cksum_offset = cksum_offset;
2132		if (cksum_offset > seg->ds_len)
2133			cksum_offset -= seg->ds_len;
2134		else
2135			cksum_offset = 0;
2136		req->pseudo_hdr_offset = pseudo_hdr_offset;
2137		req->pad = 0; /* complete solid 16-byte block */
2138		req->rdma_count = 1;
2139		req->flags |= flags | ((cum_len & 1) * odd_flag);
2140		cum_len += seg->ds_len;
2141		seg++;
2142		req++;
2143		req->flags = 0;
2144	}
2145	req--;
2146	/* pad runts to 60 bytes */
2147	if (cum_len < 60) {
2148		req++;
2149		req->addr_low =
2150			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2151		req->addr_high =
2152			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2153		req->length = htobe16(60 - cum_len);
2154		req->cksum_offset = 0;
2155		req->pseudo_hdr_offset = pseudo_hdr_offset;
2156		req->pad = 0; /* complete solid 16-byte block */
2157		req->rdma_count = 1;
2158		req->flags |= flags | ((cum_len & 1) * odd_flag);
2159		cnt++;
2160	}
2161
2162	tx->req_list[0].rdma_count = cnt;
2163#if 0
2164	/* print what the firmware will see */
2165	for (i = 0; i < cnt; i++) {
2166		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2167		    "cso:%d, flags:0x%x, rdma:%d\n",
2168		    i, (int)ntohl(tx->req_list[i].addr_high),
2169		    (int)ntohl(tx->req_list[i].addr_low),
2170		    (int)ntohs(tx->req_list[i].length),
2171		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2172		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2173		    tx->req_list[i].rdma_count);
2174	}
2175	printf("--------------\n");
2176#endif
2177	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2178	mxge_submit_req(tx, tx->req_list, cnt);
2179#ifdef IFNET_BUF_RING
2180	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2181		/* tell the NIC to start polling this slice */
2182		*tx->send_go = 1;
2183		tx->queue_active = 1;
2184		tx->activate++;
2185		wmb();
2186	}
2187#endif
2188	return;
2189
2190drop:
2191	m_freem(m);
2192	ss->oerrors++;
2193	return;
2194}
2195
2196#ifdef IFNET_BUF_RING
2197static void
2198mxge_qflush(struct ifnet *ifp)
2199{
2200	mxge_softc_t *sc = ifp->if_softc;
2201	mxge_tx_ring_t *tx;
2202	struct mbuf *m;
2203	int slice;
2204
2205	for (slice = 0; slice < sc->num_slices; slice++) {
2206		tx = &sc->ss[slice].tx;
2207		mtx_lock(&tx->mtx);
2208		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2209			m_freem(m);
2210		mtx_unlock(&tx->mtx);
2211	}
2212	if_qflush(ifp);
2213}
2214
/*
 * Drain the slice's buf_ring into the NIC's transmit ring.  Must be
 * called with the slice tx mutex held.  Stops when the buf_ring is
 * empty or when fewer than tx->max_desc descriptors remain free (the
 * most a single packet can need); in the latter case IFF_DRV_OACTIVE
 * is set so the stack backs off until mxge_tx_done() reclaims
 * descriptors and restarts transmission.
 */
static inline void
mxge_start_locked(struct mxge_slice_state *ss)
{
	mxge_softc_t *sc;
	struct mbuf *m;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;

	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	/* (mask - (req - done)) is the number of free tx descriptors */
	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
		m = drbr_dequeue(ifp, tx->br);
		if (m == NULL) {
			return;
		}
		/* let BPF see it */
		BPF_MTAP(ifp, m);

		/* give it to the nic */
		mxge_encap(ss, m);
	}
	/* ran out of transmit slots */
	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
	    && (!drbr_empty(ifp, tx->br))) {
		ss->if_drv_flags |= IFF_DRV_OACTIVE;
		tx->stall++;
	}
}
2245
/*
 * Transmit one mbuf on a slice; the slice tx mutex must be held.
 * Fast path: if nothing is already queued and enough descriptors are
 * free, hand the packet straight to the NIC.  Otherwise enqueue it in
 * the buf_ring and let mxge_start_locked() drain the ring in order.
 * Returns 0 on success or the drbr_enqueue() error.
 */
static int
mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	int err;

	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	/* interface not running, or tx stalled: just queue the packet */
	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING) {
		err = drbr_enqueue(ifp, tx->br, m);
		return (err);
	}

	if (!drbr_needs_enqueue(ifp, tx->br) &&
	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
		/* let BPF see it */
		BPF_MTAP(ifp, m);
		/* give it to the nic */
		mxge_encap(ss, m);
	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
		return (err);
	}
	if (!drbr_empty(ifp, tx->br))
		mxge_start_locked(ss);
	return (0);
}
2277
2278static int
2279mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2280{
2281	mxge_softc_t *sc = ifp->if_softc;
2282	struct mxge_slice_state *ss;
2283	mxge_tx_ring_t *tx;
2284	int err = 0;
2285	int slice;
2286
2287	slice = m->m_pkthdr.flowid;
2288	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2289
2290	ss = &sc->ss[slice];
2291	tx = &ss->tx;
2292
2293	if (mtx_trylock(&tx->mtx)) {
2294		err = mxge_transmit_locked(ss, m);
2295		mtx_unlock(&tx->mtx);
2296	} else {
2297		err = drbr_enqueue(ifp, tx->br, m);
2298	}
2299
2300	return (err);
2301}
2302
2303#else
2304
/*
 * Drain the ifnet send queue into the NIC's transmit ring (non
 * buf_ring build).  Must be called with the tx mutex held.  Stops
 * when the queue is empty or when fewer than tx->max_desc
 * descriptors remain free; then IFF_DRV_OACTIVE is set until
 * mxge_tx_done() reclaims space and restarts transmission.
 */
static inline void
mxge_start_locked(struct mxge_slice_state *ss)
{
	mxge_softc_t *sc;
	struct mbuf *m;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;

	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;
	/* (mask - (req - done)) is the number of free tx descriptors */
	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL) {
			return;
		}
		/* let BPF see it */
		BPF_MTAP(ifp, m);

		/* give it to the nic */
		mxge_encap(ss, m);
	}
	/* ran out of transmit slots */
	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
		tx->stall++;
	}
}
2333#endif
2334static void
2335mxge_start(struct ifnet *ifp)
2336{
2337	mxge_softc_t *sc = ifp->if_softc;
2338	struct mxge_slice_state *ss;
2339
2340	/* only use the first slice for now */
2341	ss = &sc->ss[0];
2342	mtx_lock(&ss->tx.mtx);
2343	mxge_start_locked(ss);
2344	mtx_unlock(&ss->tx.mtx);
2345}
2346
2347/*
2348 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2349 * at most 32 bytes at a time, so as to avoid involving the software
2350 * pio handler in the nic.   We re-write the first segment's low
2351 * DMA address to mark it valid only after we write the entire chunk
2352 * in a burst
2353 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* stash the real address and poison the first entry so the
	   NIC ignores the chunk while the burst is in flight */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* restore the real address, then validate the chunk on the NIC
	   by rewriting the first entry's low DMA address */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2370
/*
 * Allocate a small receive buffer (a plain mbuf header), DMA-load it,
 * and record its bus address in the shadow ring at idx.  Returns 0 or
 * an errno.  The submit at "done" runs even on failure so a completed
 * group of 8 ring entries is still pushed to the NIC.
 */
static int
mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_small;
	int cnt, err;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* hand entries to the NIC in bursts of 8 (see mxge_submit_8rx) */
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
	return err;
}
2403
/*
 * Allocate a big receive buffer (a jumbo cluster), DMA-load it, and
 * record its bus address(es) in the shadow ring starting at idx.
 * With MXGE_VIRT_JUMBOS a cluster may span up to three 4KB segments,
 * each consuming its own ring entry (rx->nbufs of them).  Returns 0
 * or an errno; the submit loop at "done" runs even on failure so
 * completed groups of 8 entries are still pushed to the NIC.
 */
static int
mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg[3];
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_big;
	int cnt, err, i;

	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = rx->mlen;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));

#if MXGE_VIRT_JUMBOS
	/* additional segments of the same cluster get their own entries */
	for (i = 1; i < cnt; i++) {
		rx->shadow[idx + i].addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
		rx->shadow[idx + i].addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
       }
#endif

done:
	/* hand entries to the NIC in bursts of 8 (see mxge_submit_8rx) */
       for (i = 0; i < rx->nbufs; i++) {
		if ((idx & 7) == 7) {
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		}
		idx++;
	}
	return err;
}
2450
2451/*
2452 *  Myri10GE hardware checksums are not valid if the sender
2453 *  padded the frame with non-zero padding.  This is because
2454 *  the firmware just does a simple 16-bit 1s complement
2455 *  checksum across the entire frame, excluding the first 14
2456 *  bytes.  It is best to simply to check the checksum and
2457 *  tell the stack about it only if the checksum is good
2458 */
2459
/*
 * Verify the firmware's raw 16-bit ones-complement sum for a
 * received frame.  Returns 0 when the checksum validates, non-zero
 * otherwise — including for any frame that is not IPv4 TCP or UDP,
 * which this routine does not attempt to check.
 */
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
	struct ip *ip;
	uint16_t c;

	eh = mtod(m, struct ether_header *);

	/* only deal with IPv4 TCP & UDP for now */
	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
		return 1;
	ip = (struct ip *)(eh + 1);
	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
			    ip->ip_p != IPPROTO_UDP))
		return 1;
#ifdef INET
	/* fold the pseudo header into the firmware's payload sum;
	   a correct frame yields 0xffff here, i.e. 0 after the xor */
	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
			    - (ip->ip_hl << 2) + ip->ip_p));
#else
	c = 1;
#endif
	c ^= 0xffff;
	return (c);
}
2486
2487static void
2488mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2489{
2490	struct ether_vlan_header *evl;
2491	struct ether_header *eh;
2492	uint32_t partial;
2493
2494	evl = mtod(m, struct ether_vlan_header *);
2495	eh = mtod(m, struct ether_header *);
2496
2497	/*
2498	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2499	 * after what the firmware thought was the end of the ethernet
2500	 * header.
2501	 */
2502
2503	/* put checksum into host byte order */
2504	*csum = ntohs(*csum);
2505	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2506	(*csum) += ~partial;
2507	(*csum) +=  ((*csum) < ~partial);
2508	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2509	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2510
2511	/* restore checksum to network byte order;
2512	   later consumers expect this */
2513	*csum = htons(*csum);
2514
2515	/* save the tag */
2516#ifdef MXGE_NEW_VLAN_API
2517	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2518#else
2519	{
2520		struct m_tag *mtag;
2521		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2522				   M_NOWAIT);
2523		if (mtag == NULL)
2524			return;
2525		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2526		m_tag_prepend(m, mtag);
2527	}
2528
2529#endif
2530	m->m_flags |= M_VLANTAG;
2531
2532	/*
2533	 * Remove the 802.1q header by copying the Ethernet
2534	 * addresses over it and adjusting the beginning of
2535	 * the data in the mbuf.  The encapsulated Ethernet
2536	 * type field is already in place.
2537	 */
2538	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2539	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2540	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2541}
2542
2543
/*
 * Process one completed receive from the big (jumbo) ring: replace
 * the buffer, swap DMA maps, strip the firmware's 2-byte alignment
 * pad, un-encapsulate 802.1q frames, validate the hardware checksum
 * (handing eligible frames to LRO), and pass the frame up the
 * stack.  If a replacement buffer cannot be allocated the frame is
 * dropped and the old mbuf stays on the ring.
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_big;
	idx = rx->cnt & rx->mask;
	/* a jumbo buffer may occupy several ring entries */
	rx->cnt += rx->nbufs;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2608
/*
 * Process one completed receive from the small ring.  Mirrors
 * mxge_rx_done_big() but each frame occupies exactly one ring entry:
 * replace the buffer, swap DMA maps, strip the 2-byte pad,
 * un-encapsulate 802.1q frames, validate the hardware checksum
 * (handing eligible frames to LRO), and pass the frame up the stack.
 * If a replacement buffer cannot be allocated the frame is dropped
 * and the old mbuf stays on the ring.
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct ether_header *eh;
	struct mbuf *m;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2673
/*
 * Drain the slice's receive-completion ring, dispatching each entry
 * to the small or big receive handler based on its length, then
 * flush any active LRO sessions.  Work per call is bounded to half
 * the ring to limit the potential for livelock.
 */
static inline void
mxge_clean_rx_done(struct mxge_slice_state *ss)
{
	mxge_rx_done_t *rx_done = &ss->rx_done;
	int limit = 0;
	uint16_t length;
	uint16_t checksum;


	/* a non-zero length marks a valid completion entry */
	while (rx_done->entry[rx_done->idx].length != 0) {
		length = ntohs(rx_done->entry[rx_done->idx].length);
		rx_done->entry[rx_done->idx].length = 0;
		checksum = rx_done->entry[rx_done->idx].checksum;
		if (length <= (MHLEN - MXGEFW_PAD))
			mxge_rx_done_small(ss, length, checksum);
		else
			mxge_rx_done_big(ss, length, checksum);
		rx_done->cnt++;
		rx_done->idx = rx_done->cnt & rx_done->mask;

		/* limit potential for livelock */
		if (__predict_false(++limit > rx_done->mask / 2))
			break;
	}
#ifdef INET
	while (!SLIST_EMPTY(&ss->lro_active)) {
		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
		SLIST_REMOVE_HEAD(&ss->lro_active, next);
		mxge_lro_flush(ss, lro);
	}
#endif
}
2706
2707
2708static inline void
2709mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2710{
2711	struct ifnet *ifp;
2712	mxge_tx_ring_t *tx;
2713	struct mbuf *m;
2714	bus_dmamap_t map;
2715	int idx;
2716	int *flags;
2717
2718	tx = &ss->tx;
2719	ifp = ss->sc->ifp;
2720	while (tx->pkt_done != mcp_idx) {
2721		idx = tx->done & tx->mask;
2722		tx->done++;
2723		m = tx->info[idx].m;
2724		/* mbuf and DMA map only attached to the first
2725		   segment per-mbuf */
2726		if (m != NULL) {
2727			ss->obytes += m->m_pkthdr.len;
2728			if (m->m_flags & M_MCAST)
2729				ss->omcasts++;
2730			ss->opackets++;
2731			tx->info[idx].m = NULL;
2732			map = tx->info[idx].map;
2733			bus_dmamap_unload(tx->dmat, map);
2734			m_freem(m);
2735		}
2736		if (tx->info[idx].flag) {
2737			tx->info[idx].flag = 0;
2738			tx->pkt_done++;
2739		}
2740	}
2741
2742	/* If we have space, clear IFF_OACTIVE to tell the stack that
2743           its OK to send packets */
2744#ifdef IFNET_BUF_RING
2745	flags = &ss->if_drv_flags;
2746#else
2747	flags = &ifp->if_drv_flags;
2748#endif
2749	mtx_lock(&ss->tx.mtx);
2750	if ((*flags) & IFF_DRV_OACTIVE &&
2751	    tx->req - tx->done < (tx->mask + 1)/4) {
2752		*(flags) &= ~IFF_DRV_OACTIVE;
2753		ss->tx.wake++;
2754		mxge_start_locked(ss);
2755	}
2756#ifdef IFNET_BUF_RING
2757	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2758		/* let the NIC stop polling this queue, since there
2759		 * are no more transmits pending */
2760		if (tx->req == tx->done) {
2761			*tx->send_stop = 1;
2762			tx->queue_active = 0;
2763			tx->deactivate++;
2764			wmb();
2765		}
2766	}
2767#endif
2768	mtx_unlock(&ss->tx.mtx);
2769
2770}
2771
/*
 * Tables mapping module compliance bits (read over I2C by
 * mxge_media_probe) to FreeBSD ifmedia types.  Entries whose flag
 * is 0 have no ifmedia equivalent; only the name is reported.
 */
static struct mxge_media_type mxge_xfp_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
/* SFP+ variant; byte 3 of the module EEPROM is checked instead
   (see mxge_media_probe) */
static struct mxge_media_type mxge_sfp_media_types[] =
{
	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
	{0,		(1 << 7),	"Reserved"},
	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
};
2793
/*
 * Install the given ifmedia type, select it as current, and cache
 * it in sc->current_media so mxge_media_probe() can detect when the
 * module type changes.
 */
static void
mxge_media_set(mxge_softc_t *sc, int media_type)
{


	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
		    0, NULL);
	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
	sc->current_media = media_type;
	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
}
2805
/*
 * Reset the media list to IFM_AUTO and classify the NIC's connector
 * (CX4, XFP, SFP+, or Quad Ribbon Fiber) from the EEPROM product
 * code string so mxge_media_probe() knows whether/how to query the
 * module over I2C.
 */
static void
mxge_media_init(mxge_softc_t *sc)
{
	char *ptr;
	int i;

	ifmedia_removeall(&sc->media);
	mxge_media_set(sc, IFM_AUTO);

	/*
	 * parse the product code to determine the interface type
	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
	 * after the 3rd dash in the driver's cached copy of the
	 * EEPROM's product code string.
	 */
	ptr = sc->product_code_string;
	if (ptr == NULL) {
		device_printf(sc->dev, "Missing product code\n");
		return;
	}

	/* advance ptr to just past the 3rd dash */
	for (i = 0; i < 3; i++, ptr++) {
		ptr = strchr(ptr, '-');
		if (ptr == NULL) {
			device_printf(sc->dev,
				      "only %d dashes in PC?!?\n", i);
			return;
		}
	}
	if (*ptr == 'C' || *(ptr +1) == 'C') {
		/* -C is CX4 */
		sc->connector = MXGE_CX4;
		mxge_media_set(sc, IFM_10G_CX4);
	} else if (*ptr == 'Q') {
		/* -Q is Quad Ribbon Fiber */
		sc->connector = MXGE_QRF;
		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
		/* FreeBSD has no media type for Quad ribbon fiber */
	} else if (*ptr == 'R') {
		/* -R is XFP */
		sc->connector = MXGE_XFP;
	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
		/* -S or -2S is SFP+ */
		sc->connector = MXGE_SFP;
	} else {
		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
	}
}
2854
2855/*
2856 * Determine the media type for a NIC.  Some XFPs will identify
2857 * themselves only when their link is up, so this is initiated via a
2858 * link up interrupt.  However, this can potentially take up to
2859 * several milliseconds, so it is run via the watchdog routine, rather
2860 * than in the interrupt handler itself.
2861 */
static void
mxge_media_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *cage_type;

	struct mxge_media_type *mxge_media_types = NULL;
	int i, err, ms, mxge_media_type_entries;
	uint32_t byte;

	sc->need_media_probe = 0;

	/* pick the compliance table and EEPROM byte for this cage */
	if (sc->connector == MXGE_XFP) {
		/* -R is XFP */
		mxge_media_types = mxge_xfp_media_types;
		mxge_media_type_entries =
			sizeof (mxge_xfp_media_types) /
			sizeof (mxge_xfp_media_types[0]);
		byte = MXGE_XFP_COMPLIANCE_BYTE;
		cage_type = "XFP";
	} else 	if (sc->connector == MXGE_SFP) {
		/* -S or -2S is SFP+ */
		mxge_media_types = mxge_sfp_media_types;
		mxge_media_type_entries =
			sizeof (mxge_sfp_media_types) /
			sizeof (mxge_sfp_media_types[0]);
		cage_type = "SFP+";
		byte = 3;
	} else {
		/* nothing to do; media type cannot change */
		return;
	}

	/*
	 * At this point we know the NIC has an XFP cage, so now we
	 * try to determine what is in the cage by using the
	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
	 * register.  We read just one byte, which may take over
	 * a millisecond
	 */

	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
	cmd.data1 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
		device_printf(sc->dev, "failed to read XFP\n");
	}
	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
	}
	if (err != MXGEFW_CMD_OK) {
		return;
	}

	/* now we wait for the data to be cached */
	cmd.data0 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	/* poll up to 50ms, 1ms at a time, for the firmware to finish */
	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
		DELAY(1000);
		cmd.data0 = byte;
		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	}
	if (err != MXGEFW_CMD_OK) {
		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
			      cage_type, err, ms);
		return;
	}

	/* the first table entry is matched on the whole byte, not a bit */
	if (cmd.data0 == mxge_media_types[0].bitmask) {
		if (mxge_verbose)
			device_printf(sc->dev, "%s:%s\n", cage_type,
				      mxge_media_types[0].name);
		if (sc->current_media != mxge_media_types[0].flag) {
			mxge_media_init(sc);
			mxge_media_set(sc, mxge_media_types[0].flag);
		}
		return;
	}
	for (i = 1; i < mxge_media_type_entries; i++) {
		if (cmd.data0 & mxge_media_types[i].bitmask) {
			if (mxge_verbose)
				device_printf(sc->dev, "%s:%s\n",
					      cage_type,
					      mxge_media_types[i].name);

			if (sc->current_media != mxge_media_types[i].flag) {
				mxge_media_init(sc);
				mxge_media_set(sc, mxge_media_types[i].flag);
			}
			return;
		}
	}
	if (mxge_verbose)
		device_printf(sc->dev, "%s media 0x%x unknown\n",
			      cage_type, cmd.data0);

	return;
}
2960
/*
 * Interrupt handler (one per slice).  Non-zero MSI-X slices only
 * carry receives, so they are cleaned immediately and the interrupt
 * credit returned.  Otherwise, loop processing transmit completions
 * and receives until the firmware clears stats->valid (needed for
 * legacy-IRQ deassert confirmation), then update link state and
 * error counters from the firmware stats block (first slice only),
 * and finally return the interrupt credit(s) to the NIC.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				 sc->ifp->if_baudrate = IF_Gbps(10UL);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				sc->ifp->if_baudrate = 0;
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
}
3051
3052static void
3053mxge_init(void *arg)
3054{
3055	mxge_softc_t *sc = arg;
3056	struct ifnet *ifp = sc->ifp;
3057
3058
3059	mtx_lock(&sc->driver_mtx);
3060	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3061		(void) mxge_open(sc);
3062	mtx_unlock(&sc->driver_mtx);
3063}
3064
3065
3066
3067static void
3068mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3069{
3070	struct lro_entry *lro_entry;
3071	int i;
3072
3073	while (!SLIST_EMPTY(&ss->lro_free)) {
3074		lro_entry = SLIST_FIRST(&ss->lro_free);
3075		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3076		free(lro_entry, M_DEVBUF);
3077	}
3078
3079	for (i = 0; i <= ss->rx_big.mask; i++) {
3080		if (ss->rx_big.info[i].m == NULL)
3081			continue;
3082		bus_dmamap_unload(ss->rx_big.dmat,
3083				  ss->rx_big.info[i].map);
3084		m_freem(ss->rx_big.info[i].m);
3085		ss->rx_big.info[i].m = NULL;
3086	}
3087
3088	for (i = 0; i <= ss->rx_small.mask; i++) {
3089		if (ss->rx_small.info[i].m == NULL)
3090			continue;
3091		bus_dmamap_unload(ss->rx_small.dmat,
3092				  ss->rx_small.info[i].map);
3093		m_freem(ss->rx_small.info[i].m);
3094		ss->rx_small.info[i].m = NULL;
3095	}
3096
3097	/* transmit ring used only on the first slice */
3098	if (ss->tx.info == NULL)
3099		return;
3100
3101	for (i = 0; i <= ss->tx.mask; i++) {
3102		ss->tx.info[i].flag = 0;
3103		if (ss->tx.info[i].m == NULL)
3104			continue;
3105		bus_dmamap_unload(ss->tx.dmat,
3106				  ss->tx.info[i].map);
3107		m_freem(ss->tx.info[i].m);
3108		ss->tx.info[i].m = NULL;
3109	}
3110}
3111
3112static void
3113mxge_free_mbufs(mxge_softc_t *sc)
3114{
3115	int slice;
3116
3117	for (slice = 0; slice < sc->num_slices; slice++)
3118		mxge_free_slice_mbufs(&sc->ss[slice]);
3119}
3120
/*
 * Tear down a slice's ring state: the receive-completion DMA block,
 * the transmit bookkeeping arrays, the rx shadow rings, and the
 * per-buffer DMA maps and tags.  Safe on a partially initialized
 * slice: every resource is checked before being freed, and the
 * pointers are cleared so a second call is harmless.
 */
static void
mxge_free_slice_rings(struct mxge_slice_state *ss)
{
	int i;


	if (ss->rx_done.entry != NULL)
		mxge_dma_free(&ss->rx_done.dma);
	ss->rx_done.entry = NULL;

	if (ss->tx.req_bytes != NULL)
		free(ss->tx.req_bytes, M_DEVBUF);
	ss->tx.req_bytes = NULL;

	if (ss->tx.seg_list != NULL)
		free(ss->tx.seg_list, M_DEVBUF);
	ss->tx.seg_list = NULL;

	if (ss->rx_small.shadow != NULL)
		free(ss->rx_small.shadow, M_DEVBUF);
	ss->rx_small.shadow = NULL;

	if (ss->rx_big.shadow != NULL)
		free(ss->rx_big.shadow, M_DEVBUF);
	ss->rx_big.shadow = NULL;

	/* destroy per-buffer maps before their tag */
	if (ss->tx.info != NULL) {
		if (ss->tx.dmat != NULL) {
			for (i = 0; i <= ss->tx.mask; i++) {
				bus_dmamap_destroy(ss->tx.dmat,
						   ss->tx.info[i].map);
			}
			bus_dma_tag_destroy(ss->tx.dmat);
		}
		free(ss->tx.info, M_DEVBUF);
	}
	ss->tx.info = NULL;

	if (ss->rx_small.info != NULL) {
		if (ss->rx_small.dmat != NULL) {
			for (i = 0; i <= ss->rx_small.mask; i++) {
				bus_dmamap_destroy(ss->rx_small.dmat,
						   ss->rx_small.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_small.dmat,
					   ss->rx_small.extra_map);
			bus_dma_tag_destroy(ss->rx_small.dmat);
		}
		free(ss->rx_small.info, M_DEVBUF);
	}
	ss->rx_small.info = NULL;

	if (ss->rx_big.info != NULL) {
		if (ss->rx_big.dmat != NULL) {
			for (i = 0; i <= ss->rx_big.mask; i++) {
				bus_dmamap_destroy(ss->rx_big.dmat,
						   ss->rx_big.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_big.dmat,
					   ss->rx_big.extra_map);
			bus_dma_tag_destroy(ss->rx_big.dmat);
		}
		free(ss->rx_big.info, M_DEVBUF);
	}
	ss->rx_big.info = NULL;
}
3187
3188static void
3189mxge_free_rings(mxge_softc_t *sc)
3190{
3191	int slice;
3192
3193	for (slice = 0; slice < sc->num_slices; slice++)
3194		mxge_free_slice_rings(&sc->ss[slice]);
3195}
3196
/*
 * Allocate the host-side state for one slice's transmit and receive
 * rings: shadow rings, per-entry info arrays, busdma tags, and one
 * dmamap per ring slot (plus a spare per rx ring).  Returns 0 on
 * success or an errno; on failure the caller is expected to reclaim
 * any partial allocations via mxge_free_slice_rings().
 */
static int
mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
		       int tx_ring_entries)
{
	mxge_softc_t *sc = ss->sc;
	size_t bytes;
	int err, i;

	err = ENOMEM;

	/* allocate per-slice receive resources */

	/* ring entry counts are powers of two, so count-1 is the index mask */
	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	ss->rx_done.mask = (2 * rx_ring_entries) - 1;

	/* allocate the rx shadow rings */
	/* NOTE(review): malloc with M_WAITOK does not return NULL; the
	   NULL checks below are kept as defensive no-ops */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->rx_small.shadow == NULL)
		return err;

	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->rx_big.shadow == NULL)
		return err;

	/* allocate the rx host info rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->rx_small.info == NULL)
		return err;

	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->rx_big.info == NULL)
		return err;

	/* allocate the rx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		return err;
	}

	/* big-rx tag: with MXGE_VIRT_JUMBOS a jumbo frame is assembled
	   from up to three page-sized segments; otherwise a single
	   physically contiguous 9KB cluster is used */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
#if MXGE_VIRT_JUMBOS
				 4096,			/* boundary */
#else
				 0,			/* boundary */
#endif
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 3*4096,		/* maxsize */
#if MXGE_VIRT_JUMBOS
				 3,			/* num segs */
				 4096,			/* maxsegsize*/
#else
				 1,			/* num segs */
				 MJUM9BYTES,		/* maxsegsize*/
#endif
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		return err;
	}
	/* one dmamap per rx ring slot, plus an extra map used when
	   swapping in a replacement buffer */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		err = bus_dmamap_create(ss->rx_small.dmat, 0,
					&ss->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_small.dmat, 0,
				&ss->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
		return err;
	}

	for (i = 0; i <= ss->rx_big.mask; i++) {
		err = bus_dmamap_create(ss->rx_big.dmat, 0,
					&ss->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_big.dmat, 0,
				&ss->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
		return err;
	}

	/* now allocate TX resources */

#ifndef IFNET_BUF_RING
	/* only use a single TX ring for now */
	if (ss != ss->sc->ss)
		return 0;
#endif

	ss->tx.mask = tx_ring_entries - 1;
	/* cap how many descriptors a single packet may consume */
	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);


	/* allocate the tx request copy block; over-allocated by 8 bytes
	   so req_list can be 8-byte aligned below */
	bytes = 8 +
		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (ss->tx.req_bytes == NULL)
		return err;
	/* ensure req_list entries are aligned to 8 bytes */
	ss->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
	ss->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (ss->tx.seg_list == NULL)
		return err;

	/* allocate the tx host info ring */
	bytes = tx_ring_entries * sizeof (*ss->tx.info);
	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->tx.info == NULL)
		return err;

	/* allocate the tx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx_boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 ss->tx.max_desc - 2,	/* num segs */
				 sc->tx_boundary,	/* maxsegsz */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		return err;
	}

	/* now use these tags to setup dmamaps for each slot
	   in the ring */
	for (i = 0; i <= ss->tx.mask; i++) {
		err = bus_dmamap_create(ss->tx.dmat, 0,
					&ss->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
				      err);
			return err;
		}
	}
	return 0;

}
3382
3383static int
3384mxge_alloc_rings(mxge_softc_t *sc)
3385{
3386	mxge_cmd_t cmd;
3387	int tx_ring_size;
3388	int tx_ring_entries, rx_ring_entries;
3389	int err, slice;
3390
3391	/* get ring sizes */
3392	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3393	tx_ring_size = cmd.data0;
3394	if (err != 0) {
3395		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3396		goto abort;
3397	}
3398
3399	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3400	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3401	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3402	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3403	IFQ_SET_READY(&sc->ifp->if_snd);
3404
3405	for (slice = 0; slice < sc->num_slices; slice++) {
3406		err = mxge_alloc_slice_rings(&sc->ss[slice],
3407					     rx_ring_entries,
3408					     tx_ring_entries);
3409		if (err != 0)
3410			goto abort;
3411	}
3412	return 0;
3413
3414abort:
3415	mxge_free_rings(sc);
3416	return err;
3417
3418}
3419
3420
3421static void
3422mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3423{
3424	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3425
3426	if (bufsize < MCLBYTES) {
3427		/* easy, everything fits in a single buffer */
3428		*big_buf_size = MCLBYTES;
3429		*cl_size = MCLBYTES;
3430		*nbufs = 1;
3431		return;
3432	}
3433
3434	if (bufsize < MJUMPAGESIZE) {
3435		/* still easy, everything still fits in a single buffer */
3436		*big_buf_size = MJUMPAGESIZE;
3437		*cl_size = MJUMPAGESIZE;
3438		*nbufs = 1;
3439		return;
3440	}
3441#if MXGE_VIRT_JUMBOS
3442	/* now we need to use virtually contiguous buffers */
3443	*cl_size = MJUM9BYTES;
3444	*big_buf_size = 4096;
3445	*nbufs = mtu / 4096 + 1;
3446	/* needs to be a power of two, so round up */
3447	if (*nbufs == 3)
3448		*nbufs = 4;
3449#else
3450	*cl_size = MJUM9BYTES;
3451	*big_buf_size = MJUM9BYTES;
3452	*nbufs = 1;
3453#endif
3454}
3455
/*
 * Prepare one slice for traffic: pre-allocate LRO entries, fetch the
 * lanai (NIC-resident) ring addresses from the firmware, and stock
 * the small and big receive rings with mbufs.  Returns 0 on success,
 * EIO if the firmware queries fail, or ENOMEM if rx stocking fails.
 */
static int
mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
{
	mxge_softc_t *sc;
	mxge_cmd_t cmd;
	bus_dmamap_t map;
	struct lro_entry *lro_entry;
	int err, i, slice;


	sc = ss->sc;
	slice = ss - sc->ss;

	SLIST_INIT(&ss->lro_free);
	SLIST_INIT(&ss->lro_active);

	/* best-effort LRO entry allocation: on failure just shrink the
	   configured count instead of failing the open */
	for (i = 0; i < sc->lro_cnt; i++) {
		lro_entry = (struct lro_entry *)
			malloc(sizeof (*lro_entry), M_DEVBUF,
			       M_NOWAIT | M_ZERO);
		if (lro_entry == NULL) {
			sc->lro_cnt = i;
			break;
		}
		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
	}
	/* get the lanai pointers to the send and receive rings */

	err = 0;
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	if (slice == 0) {
#endif
		cmd.data0 = slice;
		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
		ss->tx.lanai =
			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
		ss->tx.send_go = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
	(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING
	}
#endif
	/* errors from the commands below are OR'd together and checked
	   once at the end */
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc,
			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	ss->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		map = ss->rx_small.info[i].map;
		err = mxge_get_buf_small(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, ss->rx_small.mask + 1);
			return ENOMEM;
		}
	}
	/* pre-fill every big-ring shadow slot with 0xffffffff
	   (presumably an invalid DMA address — confirm against
	   mxge_get_buf_big) before stocking */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	/* largest receive frame: MTU plus ethernet/VLAN headers and
	   the firmware's leading pad */
	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
	/* big buffers are stocked nbufs slots at a time */
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}
3545
3546static int
3547mxge_open(mxge_softc_t *sc)
3548{
3549	mxge_cmd_t cmd;
3550	int err, big_bytes, nbufs, slice, cl_size, i;
3551	bus_addr_t bus;
3552	volatile uint8_t *itable;
3553	struct mxge_slice_state *ss;
3554
3555	/* Copy the MAC address in case it was overridden */
3556	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3557
3558	err = mxge_reset(sc, 1);
3559	if (err != 0) {
3560		device_printf(sc->dev, "failed to reset\n");
3561		return EIO;
3562	}
3563
3564	if (sc->num_slices > 1) {
3565		/* setup the indirection table */
3566		cmd.data0 = sc->num_slices;
3567		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3568				    &cmd);
3569
3570		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3571				     &cmd);
3572		if (err != 0) {
3573			device_printf(sc->dev,
3574				      "failed to setup rss tables\n");
3575			return err;
3576		}
3577
3578		/* just enable an identity mapping */
3579		itable = sc->sram + cmd.data0;
3580		for (i = 0; i < sc->num_slices; i++)
3581			itable[i] = (uint8_t)i;
3582
3583		cmd.data0 = 1;
3584		cmd.data1 = mxge_rss_hash_type;
3585		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3586		if (err != 0) {
3587			device_printf(sc->dev, "failed to enable slices\n");
3588			return err;
3589		}
3590	}
3591
3592
3593	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3594
3595	cmd.data0 = nbufs;
3596	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3597			    &cmd);
3598	/* error is only meaningful if we're trying to set
3599	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3600	if (err && nbufs > 1) {
3601		device_printf(sc->dev,
3602			      "Failed to set alway-use-n to %d\n",
3603			      nbufs);
3604		return EIO;
3605	}
3606	/* Give the firmware the mtu and the big and small buffer
3607	   sizes.  The firmware wants the big buf size to be a power
3608	   of two. Luckily, FreeBSD's clusters are powers of two */
3609	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3610	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3611	cmd.data0 = MHLEN - MXGEFW_PAD;
3612	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3613			     &cmd);
3614	cmd.data0 = big_bytes;
3615	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3616
3617	if (err != 0) {
3618		device_printf(sc->dev, "failed to setup params\n");
3619		goto abort;
3620	}
3621
3622	/* Now give him the pointer to the stats block */
3623	for (slice = 0;
3624#ifdef IFNET_BUF_RING
3625	     slice < sc->num_slices;
3626#else
3627	     slice < 1;
3628#endif
3629	     slice++) {
3630		ss = &sc->ss[slice];
3631		cmd.data0 =
3632			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3633		cmd.data1 =
3634			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3635		cmd.data2 = sizeof(struct mcp_irq_data);
3636		cmd.data2 |= (slice << 16);
3637		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3638	}
3639
3640	if (err != 0) {
3641		bus = sc->ss->fw_stats_dma.bus_addr;
3642		bus += offsetof(struct mcp_irq_data, send_done_count);
3643		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3644		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3645		err = mxge_send_cmd(sc,
3646				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3647				    &cmd);
3648		/* Firmware cannot support multicast without STATS_DMA_V2 */
3649		sc->fw_multicast_support = 0;
3650	} else {
3651		sc->fw_multicast_support = 1;
3652	}
3653
3654	if (err != 0) {
3655		device_printf(sc->dev, "failed to setup params\n");
3656		goto abort;
3657	}
3658
3659	for (slice = 0; slice < sc->num_slices; slice++) {
3660		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3661		if (err != 0) {
3662			device_printf(sc->dev, "couldn't open slice %d\n",
3663				      slice);
3664			goto abort;
3665		}
3666	}
3667
3668	/* Finally, start the firmware running */
3669	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3670	if (err) {
3671		device_printf(sc->dev, "Couldn't bring up link\n");
3672		goto abort;
3673	}
3674#ifdef IFNET_BUF_RING
3675	for (slice = 0; slice < sc->num_slices; slice++) {
3676		ss = &sc->ss[slice];
3677		ss->if_drv_flags |= IFF_DRV_RUNNING;
3678		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3679	}
3680#endif
3681	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3682	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3683
3684	return 0;
3685
3686
3687abort:
3688	mxge_free_mbufs(sc);
3689
3690	return err;
3691}
3692
/*
 * Take the interface down.  Clears the RUNNING flags first so the tx
 * paths stop submitting work.  Unless "down" is set (meaning the NIC
 * is already known to be down, e.g. after a watchdog reset), send
 * MXGEFW_CMD_ETHERNET_DOWN and wait for the firmware's "down"
 * interrupt, observed as a change in sc->down_cnt, before reclaiming
 * mbufs.  Always returns 0.
 */
static int
mxge_close(mxge_softc_t *sc, int down)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;
#ifdef IFNET_BUF_RING
	struct mxge_slice_state *ss;
	int slice;
#endif

#ifdef IFNET_BUF_RING
	/* stop every per-slice tx path */
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
	}
#endif
	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	if (!down) {
		/* snapshot the down counter before issuing the command
		   so the interrupt handler's increment is detectable */
		old_down_cnt = sc->down_cnt;
		wmb();
		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
		if (err) {
			device_printf(sc->dev,
				      "Couldn't bring down link\n");
		}
		if (old_down_cnt == sc->down_cnt) {
			/* wait for down irq */
			DELAY(10 * sc->intr_coal_delay);
		}
		wmb();
		if (old_down_cnt == sc->down_cnt) {
			device_printf(sc->dev, "never got down irq\n");
		}
	}
	mxge_free_mbufs(sc);

	return 0;
}
3731
/*
 * Program the parts of PCI config space the driver cares about:
 * record the negotiated PCIe link width, set the PCIe max read
 * request size to 4KB, and enable bus mastering and memory-space
 * decoding.  Also used after a watchdog reset, where the previously
 * saved device-control value (sc->pectl) is restored instead.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* reg + 0x12: PCIe Link Status register; bits 9:4 hold
		   the negotiated link width */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		if (sc->pectl == 0) {
			/* reg + 0x8: PCIe Device Control register;
			   5 << 12 sets Max_Read_Request_Size to 4096B */
			pectl = pci_read_config(dev, reg + 0x8, 2);
			pectl = (pectl & ~0x7000) | (5 << 12);
			pci_write_config(dev, reg + 0x8, pectl, 2);
			sc->pectl = pectl;
		} else {
			/* restore saved pectl after watchdog reset */
			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
		}
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
3761
/*
 * Read the NIC's reboot status register through the vendor-specific
 * PCI capability.  Uses only config-space accesses (no BARs), which
 * is why mxge_watchdog_reset can call it before restoring config
 * space.  Returns the status value, or (uint32_t)-1 if the
 * vendor-specific capability cannot be located.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	/* vs + 0x14 holds the data returned for the requested address */
	return (pci_read_config(dev, vs + 0x14, 4));
}
3780
/*
 * Recover from a NIC hang or crash, called from the watchdog task
 * with the driver mutex held.  If the NIC rebooted (detected by the
 * busmaster enable bit having been cleared in PCI config space),
 * quiesce transmit, close the interface, restore PCI config space,
 * reload firmware, and re-open.  On success the periodic tick callout
 * is re-armed; on failure the device is left down.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	struct pci_devinfo *dinfo;
	struct mxge_slice_state *ss;
	int err, running, s, num_tx_slices = 1;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
		if (running) {

			/*
			 * quiesce NIC so that TX routines will not try to
			 * xmit after restoration of BAR
			 */

			/* Mark the link as down */
			if (sc->link_state) {
				sc->link_state = 0;
				if_link_state_change(sc->ifp,
						     LINK_STATE_DOWN);
			}
#ifdef IFNET_BUF_RING
			num_tx_slices = sc->num_slices;
#endif
			/* grab all TX locks to ensure no tx  */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
				mtx_lock(&ss->tx.mtx);
			}
			/* NIC is already down; skip the firmware
			   ETHERNET_DOWN handshake */
			mxge_close(sc, 1);
		}
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reload f/w */
		err = mxge_load_firmware(sc, 0);
		if (err) {
			device_printf(sc->dev,
				      "Unable to re-load f/w\n");
		}
		if (running) {
			if (!err)
				err = mxge_open(sc);
			/* release all TX locks */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
#ifdef IFNET_BUF_RING
				mxge_start_locked(ss);
#endif
				mtx_unlock(&ss->tx.mtx);
			}
		}
		sc->watchdog_resets++;
	} else {
		device_printf(sc->dev,
			      "NIC did not reboot, not resetting\n");
		err = 0;
	}
	if (err) {
		device_printf(sc->dev, "watchdog reset failed\n");
	} else {
		/* recovery succeeded; clear the dying mark set by
		   mxge_tick and resume the periodic callout */
		if (sc->dying == 2)
			sc->dying = 0;
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
}
3882
3883static void
3884mxge_watchdog_task(void *arg, int pending)
3885{
3886	mxge_softc_t *sc = arg;
3887
3888
3889	mtx_lock(&sc->driver_mtx);
3890	mxge_watchdog_reset(sc);
3891	mtx_unlock(&sc->driver_mtx);
3892}
3893
3894static void
3895mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3896{
3897	tx = &sc->ss[slice].tx;
3898	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3899	device_printf(sc->dev,
3900		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3901		      tx->req, tx->done, tx->queue_active);
3902	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3903			      tx->activate, tx->deactivate);
3904	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3905		      tx->pkt_done,
3906		      be32toh(sc->ss->fw_stats->send_done_count));
3907}
3908
/*
 * Transmit watchdog, run periodically from mxge_tick.  A ring is
 * considered stuck when it has outstanding requests and made no
 * progress since the previous watchdog pass (done == watchdog_done).
 * If the firmware's dropped_pause counter also did not move, the hang
 * is not explained by flow control, so a watchdog reset is queued and
 * ENXIO returned; otherwise only a warning is printed.  Returns 0
 * when no reset was queued.
 */
static int
mxge_watchdog(mxge_softc_t *sc)
{
	mxge_tx_ring_t *tx;
	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
	int i, err = 0;

	/* see if we have outstanding transmits, which
	   have been pending for more than mxge_ticks */
	for (i = 0;
#ifdef IFNET_BUF_RING
	     (i < sc->num_slices) && (err == 0);
#else
	     (i < 1) && (err == 0);
#endif
	     i++) {
		tx = &sc->ss[i].tx;
		if (tx->req != tx->done &&
		    tx->watchdog_req != tx->watchdog_done &&
		    tx->done == tx->watchdog_done) {
			/* check for pause blocking before resetting */
			if (tx->watchdog_rx_pause == rx_pause) {
				mxge_warn_stuck(sc, tx, i);
				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
				return (ENXIO);
			}
			else
				device_printf(sc->dev, "Flow control blocking "
					      "xmits, check link partner\n");
		}

		/* record this pass's state for the next comparison */
		tx->watchdog_req = tx->req;
		tx->watchdog_done = tx->done;
		tx->watchdog_rx_pause = rx_pause;
	}

	if (sc->need_media_probe)
		mxge_media_probe(sc);
	return (err);
}
3949
3950static u_long
3951mxge_update_stats(mxge_softc_t *sc)
3952{
3953	struct mxge_slice_state *ss;
3954	u_long pkts = 0;
3955	u_long ipackets = 0;
3956	u_long opackets = 0;
3957#ifdef IFNET_BUF_RING
3958	u_long obytes = 0;
3959	u_long omcasts = 0;
3960	u_long odrops = 0;
3961#endif
3962	u_long oerrors = 0;
3963	int slice;
3964
3965	for (slice = 0; slice < sc->num_slices; slice++) {
3966		ss = &sc->ss[slice];
3967		ipackets += ss->ipackets;
3968		opackets += ss->opackets;
3969#ifdef IFNET_BUF_RING
3970		obytes += ss->obytes;
3971		omcasts += ss->omcasts;
3972		odrops += ss->tx.br->br_drops;
3973#endif
3974		oerrors += ss->oerrors;
3975	}
3976	pkts = (ipackets - sc->ifp->if_ipackets);
3977	pkts += (opackets - sc->ifp->if_opackets);
3978	sc->ifp->if_ipackets = ipackets;
3979	sc->ifp->if_opackets = opackets;
3980#ifdef IFNET_BUF_RING
3981	sc->ifp->if_obytes = obytes;
3982	sc->ifp->if_omcasts = omcasts;
3983	sc->ifp->if_snd.ifq_drops = odrops;
3984#endif
3985	sc->ifp->if_oerrors = oerrors;
3986	return pkts;
3987}
3988
/*
 * Periodic housekeeping callout.  While the interface is running it
 * aggregates per-slice statistics and runs the transmit watchdog
 * about every 4th tick.  When no packets moved since the last tick,
 * it also checks that the idle NIC did not crash (detected as a
 * cleared PCI busmaster bit) and polls less often.  Re-arms itself
 * unless a watchdog reset was queued (the reset path re-arms it).
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;
	u_long pkts = 0;
	int err = 0;
	int running, ticks;
	uint16_t cmd;

	ticks = mxge_ticks;
	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
	if (running) {
		/* aggregate stats from different slices */
		pkts = mxge_update_stats(sc);
		if (!sc->watchdog_countdown) {
			err = mxge_watchdog(sc);
			sc->watchdog_countdown = 4;
		}
		sc->watchdog_countdown--;
	}
	if (pkts == 0) {
		/* ensure NIC did not suffer h/w fault while idle */
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
			/* mark the device dying until the watchdog
			   task recovers it */
			sc->dying = 2;
			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
			err = ENXIO;
		}
		/* look less often if NIC is idle */
		ticks *= 4;
	}

	if (err == 0)
		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);

}
4025
4026static int
4027mxge_media_change(struct ifnet *ifp)
4028{
4029	return EINVAL;
4030}
4031
4032static int
4033mxge_change_mtu(mxge_softc_t *sc, int mtu)
4034{
4035	struct ifnet *ifp = sc->ifp;
4036	int real_mtu, old_mtu;
4037	int err = 0;
4038
4039
4040	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4041	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4042		return EINVAL;
4043	mtx_lock(&sc->driver_mtx);
4044	old_mtu = ifp->if_mtu;
4045	ifp->if_mtu = mtu;
4046	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4047		mxge_close(sc, 0);
4048		err = mxge_open(sc);
4049		if (err != 0) {
4050			ifp->if_mtu = old_mtu;
4051			mxge_close(sc, 0);
4052			(void) mxge_open(sc);
4053		}
4054	}
4055	mtx_unlock(&sc->driver_mtx);
4056	return err;
4057}
4058
4059static void
4060mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4061{
4062	mxge_softc_t *sc = ifp->if_softc;
4063
4064
4065	if (sc == NULL)
4066		return;
4067	ifmr->ifm_status = IFM_AVALID;
4068	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4069	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4070	ifmr->ifm_active |= sc->current_media;
4071}
4072
/*
 * ifnet ioctl handler: address/MTU/flags changes, multicast list
 * updates, capability toggles (checksum offload, TSO, LRO, VLAN
 * offloads), and media queries.  Serialized against open/close and
 * the watchdog by sc->driver_mtx.  Returns 0 or an errno.
 */
static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		/* refuse to bring up a device undergoing detach/reset */
		if (sc->dying) {
			mtx_unlock(&sc->driver_mtx);
			return EINVAL;
		}
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				err = mxge_open(sc);
			} else {
				/* take care of promiscuous and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				mxge_close(sc, 0);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		/* disabling TXCSUM also disables TSO, which depends on it */
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		/* NOTE(review): the "else if" means TXCSUM and RXCSUM
		   cannot be toggled in a single request; looks
		   intentional but worth confirming */
		} else if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask & IFCAP_TSO4) {
			if (IFCAP_TSO4 & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TSO4;
				ifp->if_hwassist &= ~CSUM_TSO;
			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable |= IFCAP_TSO4;
				ifp->if_hwassist |= CSUM_TSO;
			} else {
				printf("mxge requires tx checksum offload"
				       " be enabled to use TSO\n");
				err = EINVAL;
			}
		}
		if (mask & IFCAP_LRO) {
			if (IFCAP_LRO & ifp->if_capenable)
				err = mxge_change_lro_locked(sc, 0);
			else
				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
		}
		if (mask & IFCAP_VLAN_HWTAGGING)
			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
		if (mask & IFCAP_VLAN_HWTSO)
			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;

		/* VLAN TSO requires hw VLAN tagging and hw support */
		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;

		mtx_unlock(&sc->driver_mtx);
		VLAN_CAPABILITIES(ifp);

		break;

	case SIOCGIFMEDIA:
		mtx_lock(&sc->driver_mtx);
		mxge_media_probe(sc);
		mtx_unlock(&sc->driver_mtx);
		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
				    &sc->media, command);
                break;

	default:
		err = ENOTTY;
        }
	return err;
}
4189
/*
 * Fetch all hw.mxge.* loader tunables into the driver globals (and
 * sc->lro_cnt), then clamp each value to a sane range.  Called once
 * per device during attach.
 */
static void
mxge_fetch_tunables(mxge_softc_t *sc)
{

	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
			  &mxge_flow_control);
	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
			  &mxge_intr_coal_delay);
	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
			  &mxge_nvidia_ecrc_enable);
	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
			  &mxge_force_firmware);
	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
			  &mxge_deassert_wait);
	TUNABLE_INT_FETCH("hw.mxge.verbose",
			  &mxge_verbose);
	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
	/* both spellings of the rss hash tunable are accepted; the
	   second fetch wins when both are set */
	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
	if (sc->lro_cnt != 0)
		mxge_lro_cnt = sc->lro_cnt;

	if (bootverbose)
		mxge_verbose = 1;
	/* clamp interrupt coalescing to [0, 10ms] */
	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
		mxge_intr_coal_delay = 30;
	if (mxge_ticks == 0)
		mxge_ticks = hz / 2;
	sc->pause = mxge_flow_control;
	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
	}
	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
	    mxge_initial_mtu < ETHER_MIN_LEN)
		mxge_initial_mtu = ETHERMTU_JUMBO;

	/* a throttle of 0 means "disabled"; otherwise clamp it */
	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
		mxge_throttle = MXGE_MAX_THROTTLE;
	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
		mxge_throttle = MXGE_MIN_THROTTLE;
	sc->throttle = mxge_throttle;
}
4238
4239
/*
 * Free everything mxge_alloc_slices() set up: the per-slice firmware
 * stats DMA block, tx mutex and (optionally) buf_ring, and the rx
 * completion queue, then the slice array itself.  Safe to call when
 * sc->ss is NULL or only partially initialized.
 */
static void
mxge_free_slices(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int i;


	if (sc->ss == NULL)
		return;

	for (i = 0; i < sc->num_slices; i++) {
		ss = &sc->ss[i];
		/* fw_stats is allocated together with the tx mutex and
		   buf_ring in mxge_alloc_slices, so its presence gates
		   their teardown as well */
		if (ss->fw_stats != NULL) {
			mxge_dma_free(&ss->fw_stats_dma);
			ss->fw_stats = NULL;
#ifdef IFNET_BUF_RING
			if (ss->tx.br != NULL) {
				drbr_free(ss->tx.br, M_DEVBUF);
				ss->tx.br = NULL;
			}
#endif
			mtx_destroy(&ss->tx.mtx);
		}
		if (ss->rx_done.entry != NULL) {
			mxge_dma_free(&ss->rx_done.dma);
			ss->rx_done.entry = NULL;
		}
	}
	free(sc->ss, M_DEVBUF);
	sc->ss = NULL;
}
4271
/*
 * Allocate the per-slice state array and, for each slice, its rx
 * completion queue DMA block, firmware stats DMA block, tx mutex and
 * (optionally) buf_ring.  Returns 0 or an errno; on failure anything
 * already allocated is released via mxge_free_slices().
 */
static int
mxge_alloc_slices(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct mxge_slice_state *ss;
	size_t bytes;
	int err, i, max_intr_slots;

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		return err;
	}
	sc->rx_ring_size = cmd.data0;
	/* the completion queue must hold an event per entry of both
	   rx rings */
	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));

	bytes = sizeof (*sc->ss) * sc->num_slices;
	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (sc->ss == NULL)
		return (ENOMEM);
	for (i = 0; i < sc->num_slices; i++) {
		ss = &sc->ss[i];

		ss->sc = sc;

		/* allocate per-slice rx interrupt queues */

		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
		if (err != 0)
			goto abort;
		ss->rx_done.entry = ss->rx_done.dma.addr;
		bzero(ss->rx_done.entry, bytes);

		/*
		 * allocate the per-slice firmware stats; stats
		 * (including tx) are used only on the first
		 * slice for now
		 */
#ifndef IFNET_BUF_RING
		if (i > 0)
			continue;
#endif

		bytes = sizeof (*ss->fw_stats);
		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
				     sizeof (*ss->fw_stats), 64);
		if (err != 0)
			goto abort;
		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
#ifdef IFNET_BUF_RING
		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
					   &ss->tx.mtx);
#endif
	}

	return (0);

abort:
	mxge_free_slices(sc);
	return (ENOMEM);
}
4337
/*
 * Decide how many slices (receive queues, each with its own interrupt)
 * to use, leaving the result in sc->num_slices.
 *
 * Multiple slices require the RSS-capable firmware, at least two MSI-X
 * vectors, and an SMP machine.  The slice count is capped by what the
 * firmware reports, the available MSI-X vectors, mp_ncpus (or the
 * hw.mxge.max_slices tunable), and is rounded down to a power of two.
 * On any failure the original firmware is reloaded and the device falls
 * back to a single slice (sc->num_slices stays 1).
 */
static void
mxge_slice_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *old_fw;
	int msix_cnt, status, max_intr_slots;

	sc->num_slices = 1;
	/*
	 *  don't enable multiple slices if they are not enabled,
	 *  or if this is not an SMP system
	 */

	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
		return;

	/* see how many MSI-X interrupts are available */
	msix_cnt = pci_msix_count(sc->dev);
	if (msix_cnt < 2)
		return;

	/* now load the slice aware firmware and see what it supports */
	old_fw = sc->fw_name;
	if (old_fw == mxge_fw_aligned)
		sc->fw_name = mxge_fw_rss_aligned;
	else
		sc->fw_name = mxge_fw_rss_unaligned;
	status = mxge_load_firmware(sc, 0);
	if (status != 0) {
		device_printf(sc->dev, "Falling back to a single slice\n");
		return;
	}

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		goto abort_with_fw;
	}

	/* get rx ring size */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		goto abort_with_fw;
	}
	/* same sizing formula as the interrupt queues in mxge_alloc_slices() */
	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));

	/* tell it the size of the interrupt queues */
	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
		goto abort_with_fw;
	}

	/* ask the maximum number of slices it supports */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
	if (status != 0) {
		device_printf(sc->dev,
			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
		goto abort_with_fw;
	}
	sc->num_slices = cmd.data0;
	/* each slice needs its own MSI-X vector */
	if (sc->num_slices > msix_cnt)
		sc->num_slices = msix_cnt;

	if (mxge_max_slices == -1) {
		/* cap to number of CPUs in system */
		if (sc->num_slices > mp_ncpus)
			sc->num_slices = mp_ncpus;
	} else {
		if (sc->num_slices > mxge_max_slices)
			sc->num_slices = mxge_max_slices;
	}
	/* make sure it is a power of two */
	while (sc->num_slices & (sc->num_slices - 1))
		sc->num_slices--;

	if (mxge_verbose)
		device_printf(sc->dev, "using %d slices\n",
			      sc->num_slices);

	return;

abort_with_fw:
	/* restore the original (non-RSS) firmware; run with one slice */
	sc->fw_name = old_fw;
	(void) mxge_load_firmware(sc, 0);
}
4429
4430static int
4431mxge_add_msix_irqs(mxge_softc_t *sc)
4432{
4433	size_t bytes;
4434	int count, err, i, rid;
4435
4436	rid = PCIR_BAR(2);
4437	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4438						    &rid, RF_ACTIVE);
4439
4440	if (sc->msix_table_res == NULL) {
4441		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4442		return ENXIO;
4443	}
4444
4445	count = sc->num_slices;
4446	err = pci_alloc_msix(sc->dev, &count);
4447	if (err != 0) {
4448		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4449			      "err = %d \n", sc->num_slices, err);
4450		goto abort_with_msix_table;
4451	}
4452	if (count < sc->num_slices) {
4453		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4454			      count, sc->num_slices);
4455		device_printf(sc->dev,
4456			      "Try setting hw.mxge.max_slices to %d\n",
4457			      count);
4458		err = ENOSPC;
4459		goto abort_with_msix;
4460	}
4461	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4462	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4463	if (sc->msix_irq_res == NULL) {
4464		err = ENOMEM;
4465		goto abort_with_msix;
4466	}
4467
4468	for (i = 0; i < sc->num_slices; i++) {
4469		rid = i + 1;
4470		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4471							  SYS_RES_IRQ,
4472							  &rid, RF_ACTIVE);
4473		if (sc->msix_irq_res[i] == NULL) {
4474			device_printf(sc->dev, "couldn't allocate IRQ res"
4475				      " for message %d\n", i);
4476			err = ENXIO;
4477			goto abort_with_res;
4478		}
4479	}
4480
4481	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4482	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4483
4484	for (i = 0; i < sc->num_slices; i++) {
4485		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4486				     INTR_TYPE_NET | INTR_MPSAFE,
4487#if __FreeBSD_version > 700030
4488				     NULL,
4489#endif
4490				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4491		if (err != 0) {
4492			device_printf(sc->dev, "couldn't setup intr for "
4493				      "message %d\n", i);
4494			goto abort_with_intr;
4495		}
4496		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4497				  sc->msix_ih[i], "s%d", i);
4498	}
4499
4500	if (mxge_verbose) {
4501		device_printf(sc->dev, "using %d msix IRQs:",
4502			      sc->num_slices);
4503		for (i = 0; i < sc->num_slices; i++)
4504			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4505		printf("\n");
4506	}
4507	return (0);
4508
4509abort_with_intr:
4510	for (i = 0; i < sc->num_slices; i++) {
4511		if (sc->msix_ih[i] != NULL) {
4512			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4513					  sc->msix_ih[i]);
4514			sc->msix_ih[i] = NULL;
4515		}
4516	}
4517	free(sc->msix_ih, M_DEVBUF);
4518
4519
4520abort_with_res:
4521	for (i = 0; i < sc->num_slices; i++) {
4522		rid = i + 1;
4523		if (sc->msix_irq_res[i] != NULL)
4524			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4525					     sc->msix_irq_res[i]);
4526		sc->msix_irq_res[i] = NULL;
4527	}
4528	free(sc->msix_irq_res, M_DEVBUF);
4529
4530
4531abort_with_msix:
4532	pci_release_msi(sc->dev);
4533
4534abort_with_msix_table:
4535	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4536			     sc->msix_table_res);
4537
4538	return err;
4539}
4540
4541static int
4542mxge_add_single_irq(mxge_softc_t *sc)
4543{
4544	int count, err, rid;
4545
4546	count = pci_msi_count(sc->dev);
4547	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4548		rid = 1;
4549	} else {
4550		rid = 0;
4551		sc->legacy_irq = 1;
4552	}
4553	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4554					 1, RF_SHAREABLE | RF_ACTIVE);
4555	if (sc->irq_res == NULL) {
4556		device_printf(sc->dev, "could not alloc interrupt\n");
4557		return ENXIO;
4558	}
4559	if (mxge_verbose)
4560		device_printf(sc->dev, "using %s irq %ld\n",
4561			      sc->legacy_irq ? "INTx" : "MSI",
4562			      rman_get_start(sc->irq_res));
4563	err = bus_setup_intr(sc->dev, sc->irq_res,
4564			     INTR_TYPE_NET | INTR_MPSAFE,
4565#if __FreeBSD_version > 700030
4566			     NULL,
4567#endif
4568			     mxge_intr, &sc->ss[0], &sc->ih);
4569	if (err != 0) {
4570		bus_release_resource(sc->dev, SYS_RES_IRQ,
4571				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4572		if (!sc->legacy_irq)
4573			pci_release_msi(sc->dev);
4574	}
4575	return err;
4576}
4577
4578static void
4579mxge_rem_msix_irqs(mxge_softc_t *sc)
4580{
4581	int i, rid;
4582
4583	for (i = 0; i < sc->num_slices; i++) {
4584		if (sc->msix_ih[i] != NULL) {
4585			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4586					  sc->msix_ih[i]);
4587			sc->msix_ih[i] = NULL;
4588		}
4589	}
4590	free(sc->msix_ih, M_DEVBUF);
4591
4592	for (i = 0; i < sc->num_slices; i++) {
4593		rid = i + 1;
4594		if (sc->msix_irq_res[i] != NULL)
4595			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4596					     sc->msix_irq_res[i]);
4597		sc->msix_irq_res[i] = NULL;
4598	}
4599	free(sc->msix_irq_res, M_DEVBUF);
4600
4601	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4602			     sc->msix_table_res);
4603
4604	pci_release_msi(sc->dev);
4605	return;
4606}
4607
4608static void
4609mxge_rem_single_irq(mxge_softc_t *sc)
4610{
4611	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4612	bus_release_resource(sc->dev, SYS_RES_IRQ,
4613			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4614	if (!sc->legacy_irq)
4615		pci_release_msi(sc->dev);
4616}
4617
4618static void
4619mxge_rem_irq(mxge_softc_t *sc)
4620{
4621	if (sc->num_slices > 1)
4622		mxge_rem_msix_irqs(sc);
4623	else
4624		mxge_rem_single_irq(sc);
4625}
4626
4627static int
4628mxge_add_irq(mxge_softc_t *sc)
4629{
4630	int err;
4631
4632	if (sc->num_slices > 1)
4633		err = mxge_add_msix_irqs(sc);
4634	else
4635		err = mxge_add_single_irq(sc);
4636
4637	if (0 && err == 0 && sc->num_slices > 1) {
4638		mxge_rem_msix_irqs(sc);
4639		err = mxge_add_msix_irqs(sc);
4640	}
4641	return err;
4642}
4643
4644
/*
 * Device attach: bring the adapter from cold to a registered, running
 * network interface.
 *
 * Acquires, in order: watchdog taskqueue, parent DMA tag, ifnet, command
 * and driver mutexes, the SRAM BAR mapping, out-of-band DMA buffers,
 * firmware, slices, rings, and interrupts; then fills in ifnet
 * capabilities and calls ether_ifattach().  Each failure point unwinds
 * everything acquired so far via the goto ladder at the bottom (labels
 * release resources in reverse order of acquisition).  Returns 0 on
 * success or an errno value.
 */
static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	int err, rid;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	/* taskqueue for the watchdog; threads are started at the end */
	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
				  taskqueue_thread_enqueue, &sc->tq);
	if (sc->tq == NULL) {
		err = ENOMEM;
		goto abort_with_nothing;
	}

	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
				 1,			/* alignment */
				 0,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 65536,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_tq;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/* per-unit mutex names so they are distinguishable in lock output */
	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	/* tick callout runs under the driver mutex */
	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	/* expected usable SRAM size; sanity-checked against the BAR below */
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_zeropad_dma;

	/* select & load the firmware */
	err = mxge_select_firmware(sc);
	if (err != 0)
		goto abort_with_dmabench;
	sc->intr_coal_delay = mxge_intr_coal_delay;

	/* determine slice count, then allocate per-slice state */
	mxge_slice_probe(sc);
	err = mxge_alloc_slices(sc);
	if (err != 0)
		goto abort_with_dmabench;

	err = mxge_reset(sc, 0);
	if (err != 0)
		goto abort_with_slices;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_slices;
	}

	err = mxge_add_irq(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to add irq\n");
		goto abort_with_rings;
	}

	/* advertise interface capabilities */
	ifp->if_baudrate = IF_Gbps(10UL);
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
#ifdef INET
	ifp->if_capabilities |= IFCAP_LRO;
#endif

#ifdef MXGE_NEW_VLAN_API
	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;

	/* Only FW 1.4.32 and newer can do TSO over vlans */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 32)
		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
#endif

	sc->max_mtu = mxge_max_mtu(sc);
	if (sc->max_mtu >= 9000)
		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
	else
		device_printf(dev, "MTU limited to %d.  Install "
			      "latest firmware for 9000 byte jumbo support\n",
			      sc->max_mtu - ETHER_HDR_LEN);
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_capenable = ifp->if_capabilities;
	if (sc->lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	sc->csum_flag = 1;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	mxge_media_init(sc);
	mxge_media_probe(sc);
	sc->dying = 0;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to ETHERMTU */
	if (mxge_initial_mtu != ETHERMTU)
		mxge_change_mtu(sc, mxge_initial_mtu);

	mxge_add_sysctls(sc);
#ifdef IFNET_BUF_RING
	ifp->if_transmit = mxge_transmit;
	ifp->if_qflush = mxge_qflush;
#endif
	/* start the watchdog taskqueue thread and the periodic tick */
	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
				device_get_nameunit(sc->dev));
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	return 0;

	/* error unwind: each label releases one acquisition, newest first */
abort_with_rings:
	mxge_free_rings(sc);
abort_with_slices:
	mxge_free_slices(sc);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);
abort_with_tq:
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
abort_with_nothing:
	return err;
}
4858
/*
 * Device detach: refuse while vlans are attached, otherwise close the
 * interface and release every resource acquired in mxge_attach(), in
 * reverse order of acquisition.  Returns 0 on success or EBUSY.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	if (mxge_vlans_active(sc)) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	mtx_lock(&sc->driver_mtx);
	/* mark the device as going away before closing it */
	sc->dying = 1;
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc, 0);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	/* drain and destroy the watchdog taskqueue before freeing state */
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
	callout_drain(&sc->co_hdl);
	ifmedia_removeall(&sc->media);
	mxge_dummy_rdma(sc, 0);
	mxge_rem_sysctls(sc);
	mxge_rem_irq(sc);
	mxge_free_rings(sc);
	mxge_free_slices(sc);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
4898
4899static int
4900mxge_shutdown(device_t dev)
4901{
4902	return 0;
4903}
4904
4905/*
4906  This file uses Myri10GE driver indentation.
4907
4908  Local Variables:
4909  c-file-style:"linux"
4910  tab-width:8
4911  End:
4912*/
4913