if_mxge.c revision 170330
/******************************************************************************

Copyright (c) 2006, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.

 3. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 170330 2007-06-05 15:02:14Z gallatin $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/memrange.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);

static int
mxge_probe(device_t dev)
{
  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
	  device_set_desc(dev, "Myri10G-PCIE-8A");
	  return 0;
  }
  return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	len = rman_get_size(sc->mem_res);
#if defined(__i386) || defined(__amd64)
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err == 0)
		return;
	else
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
#endif
	pa = rman_get_start(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	} else {
		sc->wc = 1;
	}
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
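/*
 * A hypothetical example of the raw string region (values made up for
 * illustration only):
 *   "SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=sample text\0\0"
 * mxge_parse_strings() below walks these NUL-terminated records in place.
 */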

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx.boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx.boundary to 4KB.
 */
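
/*
 * Net effect of the selection logic below: if aligned completions are
 * forced, verified by mxge_firmware_probe(), or implied by a narrow
 * (x4 or less) PCIe link, use eth_z8e with tx.boundary = 4096;
 * otherwise fall back to ethp_z8e with tx.boundary = 2048.
 */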

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx.boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx.boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;


	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx.boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
	}
	return (mxge_load_firmware(sc));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	fw_data = (const char *)fw->data;
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	mb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		mb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{

	mxge_cmd_t cmd;
	size_t bytes;
	int status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
		memset(sc->rx_done.entry, 0, bytes);
		cmd.data0 = (uint32_t)bytes;
		status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
		cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
		cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
		status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	/* reset mcp/driver shared state back to 0 */
	sc->rx_done.idx = 0;
	sc->rx_done.cnt = 0;
	sc->tx.req = 0;
	sc->tx.done = 0;
	sc->tx.pkt_done = 0;
	sc->tx.wake = 0;
	sc->tx_defrag = 0;
	sc->tx.stall = 0;
	sc->rx_big.cnt = 0;
	sc->rx_small.cnt = 0;
	sc->rdma_tags_available = 15;
	sc->fw_stats->valid = 0;
	sc->fw_stats->send_done_count = 0;
	sc->lro_bad_csum = 0;
	sc->lro_queued = 0;
	sc->lro_flushed = 0;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx.boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable or disable pause (flow control)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* host counters exported for debugging */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_small_cnt",
		       CTLFLAG_RD, &sc->rx_small.cnt,
		       0, "rx_small_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_big_cnt",
		       CTLFLAG_RD, &sc->rx_big.cnt,
		       0, "rx_big_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_req",
		       CTLFLAG_RD, &sc->tx.req,
		       0, "tx_req");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_done",
		       CTLFLAG_RD, &sc->tx.done,
		       0, "tx_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_pkt_done",
		       CTLFLAG_RD, &sc->tx.pkt_done,
		       0, "tx_pkt_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_stall",
		       CTLFLAG_RD, &sc->tx.stall,
		       0, "tx_stall");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_wake",
		       CTLFLAG_RD, &sc->tx.wake,
		       0, "tx_wake");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_defrag",
		       CTLFLAG_RD, &sc->tx_defrag,
		       0, "tx_defrag");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "lro_cnt", CTLFLAG_RD, &sc->lro_cnt,
		       0, "number of lro merge queues");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
		       0, "number of lro merge queues flushed");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "lro_queued", CTLFLAG_RD, &sc->lro_queued,
		       0, "number of frames appended to lro merge queues");

}
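
/*
 * These nodes live under the per-device sysctl tree; e.g. for the first
 * unit (hypothetical path), the tunables can be adjusted from userland
 * with:
 *   sysctl dev.mxge.0.intr_coal_delay=30
 *   sysctl dev.mxge.0.flow_control_enabled=1
 */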

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}

static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt,
	       int ip_off)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   sc->scratch);
		ip = (struct ip *)(sc->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		ip = (struct ip *)(sc->scratch + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
1590	 * in the middle, this is the number of RDMAs starting
1591	 * after the last cut in the request. All previous
1592	 * segments before the last cut implicitly have 1 RDMA.
1593	 *
1594	 * Since the number of RDMAs is not known beforehand,
1595	 * it must be filled-in retroactively - after each
1596	 * segmentation cut or at the end of the entire packet.
1597	 */
1598
1599	while (busdma_seg_cnt) {
1600		/* Break the busdma segment up into pieces*/
1601		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1602		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1603		len = seg->ds_len;
1604
1605		while (len) {
1606			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1607			seglen = len;
1608			cum_len_next = cum_len + seglen;
1609			(req-rdma_count)->rdma_count = rdma_count + 1;
1610			if (__predict_true(cum_len >= 0)) {
1611				/* payload */
1612				chop = (cum_len_next > mss);
1613				cum_len_next = cum_len_next % mss;
1614				next_is_first = (cum_len_next == 0);
1615				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1616				flags_next |= next_is_first *
1617					MXGEFW_FLAGS_FIRST;
1618				rdma_count |= -(chop | next_is_first);
1619				rdma_count += chop & !next_is_first;
1620			} else if (cum_len_next >= 0) {
1621				/* header ends */
1622				rdma_count = -1;
1623				cum_len_next = 0;
1624				seglen = -cum_len;
1625				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1626				flags_next = MXGEFW_FLAGS_TSO_PLD |
1627					MXGEFW_FLAGS_FIRST |
1628					(small * MXGEFW_FLAGS_SMALL);
1629			    }
1630
1631			req->addr_high = high_swapped;
1632			req->addr_low = htobe32(low);
1633			req->pseudo_hdr_offset = pseudo_hdr_offset;
1634			req->pad = 0;
1635			req->rdma_count = 1;
1636			req->length = htobe16(seglen);
1637			req->cksum_offset = cksum_offset;
1638			req->flags = flags | ((cum_len & 1) *
1639					      MXGEFW_FLAGS_ALIGN_ODD);
1640			low += seglen;
1641			len -= seglen;
1642			cum_len = cum_len_next;
1643			flags = flags_next;
1644			req++;
1645			cnt++;
1646			rdma_count++;
1647			if (__predict_false(cksum_offset > seglen))
1648				cksum_offset -= seglen;
1649			else
1650				cksum_offset = 0;
1651			if (__predict_false(cnt > tx->max_desc))
1652				goto drop;
1653		}
1654		busdma_seg_cnt--;
1655		seg++;
1656	}
1657	(req-rdma_count)->rdma_count = rdma_count;
1658
1659	do {
1660		req--;
1661		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1662	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1663
1664	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1665	mxge_submit_req(tx, tx->req_list, cnt);
1666	return;
1667
1668drop:
1669	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1670	m_freem(m);
1671	sc->ifp->if_oerrors++;
1672	if (!once) {
1673		printf("tx->max_desc exceeded via TSO!\n");
1674		printf("mss = %d, %ld, %d!\n", mss,
1675		       (long)seg - (long)tx->seg_list, tx->max_desc);
1676		once = 1;
1677	}
1678	return;
1679
1680}
1681
1682/*
1683 * We reproduce the software vlan tag insertion from
1684 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1685 * vlan tag insertion. We need to advertise this in order to have the
1686 * vlan interface respect our csum offload flags.
1687 */
1688static struct mbuf *
1689mxge_vlan_tag_insert(struct mbuf *m)
1690{
1691	struct ether_vlan_header *evl;
1692
1693	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1694	if (__predict_false(m == NULL))
1695		return NULL;
1696	if (m->m_len < sizeof(*evl)) {
1697		m = m_pullup(m, sizeof(*evl));
1698		if (__predict_false(m == NULL))
1699			return NULL;
1700	}
1701	/*
1702	 * Transform the Ethernet header into an Ethernet header
1703	 * with 802.1Q encapsulation.
1704	 */
1705	evl = mtod(m, struct ether_vlan_header *);
1706	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1707	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1708	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1709	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1710	m->m_flags &= ~M_VLANTAG;
1711	return m;
1712}
1713
1714static void
1715mxge_encap(mxge_softc_t *sc, struct mbuf *m)
1716{
1717	mcp_kreq_ether_send_t *req;
1718	bus_dma_segment_t *seg;
1719	struct mbuf *m_tmp;
1720	struct ifnet *ifp;
1721	mxge_tx_buf_t *tx;
1722	struct ip *ip;
1723	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1724	uint16_t pseudo_hdr_offset;
1725        uint8_t flags, cksum_offset;
1726
1727
1728
1729	ifp = sc->ifp;
1730	tx = &sc->tx;
1731
1732	ip_off = sizeof (struct ether_header);
1733	if (m->m_flags & M_VLANTAG) {
1734		m = mxge_vlan_tag_insert(m);
1735		if (__predict_false(m == NULL))
1736			goto drop;
1737		ip_off += ETHER_VLAN_ENCAP_LEN;
1738	}
1739
1740	/* (try to) map the frame for DMA */
1741	idx = tx->req & tx->mask;
1742	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1743				      m, tx->seg_list, &cnt,
1744				      BUS_DMA_NOWAIT);
1745	if (__predict_false(err == EFBIG)) {
1746		/* Too many segments in the chain.  Try
1747		   to defrag */
1748		m_tmp = m_defrag(m, M_NOWAIT);
1749		if (m_tmp == NULL) {
1750			goto drop;
1751		}
1752		sc->tx_defrag++;
1753		m = m_tmp;
1754		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1755					      tx->info[idx].map,
1756					      m, tx->seg_list, &cnt,
1757					      BUS_DMA_NOWAIT);
1758	}
1759	if (__predict_false(err != 0)) {
1760		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1761			      " packet len = %d\n", err, m->m_pkthdr.len);
1762		goto drop;
1763	}
1764	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1765			BUS_DMASYNC_PREWRITE);
1766	tx->info[idx].m = m;
1767
1768
1769	/* TSO is different enough, we handle it in another routine */
1770	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1771		mxge_encap_tso(sc, m, cnt, ip_off);
1772		return;
1773	}
1774
1775	req = tx->req_list;
1776	cksum_offset = 0;
1777	pseudo_hdr_offset = 0;
1778	flags = MXGEFW_FLAGS_NO_TSO;
1779
1780	/* checksum offloading? */
1781	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1782		/* ensure ip header is in first mbuf, copy
1783		   it to a scratch buffer if not */
1784		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1785			m_copydata(m, 0, ip_off + sizeof (*ip),
1786				   sc->scratch);
1787			ip = (struct ip *)(sc->scratch + ip_off);
1788		} else {
1789			ip = (struct ip *)(mtod(m, char *) + ip_off);
1790		}
1791		cksum_offset = ip_off + (ip->ip_hl << 2);
1792		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1793		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1794		req->cksum_offset = cksum_offset;
1795		flags |= MXGEFW_FLAGS_CKSUM;
1796		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1797	} else {
1798		odd_flag = 0;
1799	}
1800	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1801		flags |= MXGEFW_FLAGS_SMALL;
1802
1803	/* convert segments into a request list */
1804	cum_len = 0;
1805	seg = tx->seg_list;
1806	req->flags = MXGEFW_FLAGS_FIRST;
1807	for (i = 0; i < cnt; i++) {
1808		req->addr_low =
1809			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1810		req->addr_high =
1811			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1812		req->length = htobe16(seg->ds_len);
1813		req->cksum_offset = cksum_offset;
1814		if (cksum_offset > seg->ds_len)
1815			cksum_offset -= seg->ds_len;
1816		else
1817			cksum_offset = 0;
1818		req->pseudo_hdr_offset = pseudo_hdr_offset;
1819		req->pad = 0; /* complete solid 16-byte block */
1820		req->rdma_count = 1;
1821		req->flags |= flags | ((cum_len & 1) * odd_flag);
1822		cum_len += seg->ds_len;
1823		seg++;
1824		req++;
1825		req->flags = 0;
1826	}
1827	req--;
1828	/* pad runts to 60 bytes */
1829	if (cum_len < 60) {
1830		req++;
1831		req->addr_low =
1832			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
1833		req->addr_high =
1834			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
1835		req->length = htobe16(60 - cum_len);
1836		req->cksum_offset = 0;
1837		req->pseudo_hdr_offset = pseudo_hdr_offset;
1838		req->pad = 0; /* complete solid 16-byte block */
1839		req->rdma_count = 1;
1840		req->flags |= flags | ((cum_len & 1) * odd_flag);
1841		cnt++;
1842	}
1843
1844	tx->req_list[0].rdma_count = cnt;
1845#if 0
1846	/* print what the firmware will see */
1847	for (i = 0; i < cnt; i++) {
1848		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1849		    "cso:%d, flags:0x%x, rdma:%d\n",
1850		    i, (int)ntohl(tx->req_list[i].addr_high),
1851		    (int)ntohl(tx->req_list[i].addr_low),
1852		    (int)ntohs(tx->req_list[i].length),
1853		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1854		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1855		    tx->req_list[i].rdma_count);
1856	}
1857	printf("--------------\n");
1858#endif
1859	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1860	mxge_submit_req(tx, tx->req_list, cnt);
1861	return;
1862
1863drop:
1864	m_freem(m);
1865	ifp->if_oerrors++;
1866	return;
1867}
1868
1869
1870
1871
1872static inline void
1873mxge_start_locked(mxge_softc_t *sc)
1874{
1875	struct mbuf *m;
1876	struct ifnet *ifp;
1877	mxge_tx_buf_t *tx;
1878
1879	ifp = sc->ifp;
1880	tx = &sc->tx;
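	/* tx->req and tx->done are free-running counters; their
	   difference is the number of descriptors in flight, so this
	   loop runs only while more than max_desc slots (enough for a
	   worst-case frame) remain free */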
1881	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1882		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1883		if (m == NULL) {
1884			return;
1885		}
1886		/* let BPF see it */
1887		BPF_MTAP(ifp, m);
1888
1889		/* give it to the nic */
1890		mxge_encap(sc, m);
1891	}
1892	/* ran out of transmit slots */
1893	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1894		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1895		tx->stall++;
1896	}
1897}
1898
1899static void
1900mxge_start(struct ifnet *ifp)
1901{
1902	mxge_softc_t *sc = ifp->if_softc;
1903
1904
1905	mtx_lock(&sc->tx_mtx);
1906	mxge_start_locked(sc);
1907	mtx_unlock(&sc->tx_mtx);
1908}
1909
1910/*
1911 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1912 * at most 32 bytes at a time, so as to avoid involving the software
1913 * pio handler in the nic.  We re-write the first segment's low
1914 * DMA address to mark it valid only after we write the entire chunk
1915 * in a burst.
1916 */
1917static inline void
1918mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1919		mcp_kreq_ether_recv_t *src)
1920{
1921	uint32_t low;
1922
1923	low = src->addr_low;
1924	src->addr_low = 0xffffffff;
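	/*
	 * The firmware treats a receive descriptor as valid once its
	 * low DMA address is written, so poison the first entry while
	 * the two 32-byte bursts are copied; the real addr_low is
	 * written last, below, to publish all 8 entries at once.
	 */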
1925	mxge_pio_copy(dst, src, 4 * sizeof (*src));
1926	mb();
1927	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
1928	mb();
1929	src->addr_low = low;
1930	dst->addr_low = low;
1931	mb();
1932}
1933
1934static int
1935mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1936{
1937	bus_dma_segment_t seg;
1938	struct mbuf *m;
1939	mxge_rx_buf_t *rx = &sc->rx_small;
1940	int cnt, err;
1941
1942	m = m_gethdr(M_DONTWAIT, MT_DATA);
1943	if (m == NULL) {
1944		rx->alloc_fail++;
1945		err = ENOBUFS;
1946		goto done;
1947	}
1948	m->m_len = MHLEN;
1949	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1950				      &seg, &cnt, BUS_DMA_NOWAIT);
1951	if (err != 0) {
1952		m_free(m);
1953		goto done;
1954	}
1955	rx->info[idx].m = m;
1956	rx->shadow[idx].addr_low =
1957		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1958	rx->shadow[idx].addr_high =
1959		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1960
1961done:
1962	if ((idx & 7) == 7)
1963		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
1964	return err;
1965}
1966
1967static int
1968mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1969{
1970	bus_dma_segment_t seg[3];
1971	struct mbuf *m;
1972	mxge_rx_buf_t *rx = &sc->rx_big;
1973	int cnt, err, i;
1974
1975	if (rx->cl_size == MCLBYTES)
1976		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
1977	else
1978		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
1979	if (m == NULL) {
1980		rx->alloc_fail++;
1981		err = ENOBUFS;
1982		goto done;
1983	}
1984	m->m_len = rx->cl_size;
1985	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1986				      seg, &cnt, BUS_DMA_NOWAIT);
1987	if (err != 0) {
1988		m_free(m);
1989		goto done;
1990	}
1991	rx->info[idx].m = m;
1992
1993	for (i = 0; i < cnt; i++) {
1994		rx->shadow[idx + i].addr_low =
1995			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
1996		rx->shadow[idx + i].addr_high =
1997			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
1998	}
1999
2000
2001done:
2002	for (i = 0; i < rx->nbufs; i++) {
2003		if ((idx & 7) == 7) {
2004			mxge_submit_8rx(&rx->lanai[idx - 7],
2005					&rx->shadow[idx - 7]);
2006		}
2007		idx++;
2008	}
2009	return err;
2010}
2011
2012/*
2013 *  Myri10GE hardware checksums are not valid if the sender
2014 *  padded the frame with non-zero padding.  This is because
2015 *  the firmware just does a simple 16-bit 1s complement
2016 *  checksum across the entire frame, excluding the first 14
2017 *  bytes.  It is best simply to check the checksum and
2018 *  tell the stack about it only if the checksum is good.
2019 */
2020
2021static inline uint16_t
2022mxge_rx_csum(struct mbuf *m, int csum)
2023{
2024	struct ether_header *eh;
2025	struct ip *ip;
2026	uint16_t c;
2027
2028	eh = mtod(m, struct ether_header *);
2029
2030	/* only deal with IPv4 TCP & UDP for now */
2031	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2032		return 1;
2033	ip = (struct ip *)(eh + 1);
2034	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2035			    ip->ip_p != IPPROTO_UDP))
2036		return 1;
2037
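	/*
	 * csum is the firmware's raw 1s-complement sum of everything
	 * past the Ethernet header (a valid IP header contributes
	 * 0xffff, the 1s-complement identity).  Folding in the pseudo
	 * header (addresses, protocol, and the TCP/UDP length, i.e.
	 * ip_len minus the IP header) must therefore give 0xffff when
	 * the transport checksum is correct, so the xor below yields
	 * 0 for a good frame.
	 */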
2038	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2039		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2040			    (ip->ip_hl << 2) + ip->ip_p));
2041	c ^= 0xffff;
2042	return (c);
2043}
2044
2045static void
2046mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2047{
2048	struct ether_vlan_header *evl;
2049	struct ether_header *eh;
2050	uint32_t partial;
2051
2052	evl = mtod(m, struct ether_vlan_header *);
2053	eh = mtod(m, struct ether_header *);
2054
2055	/*
2056	 * fix the checksum by subtracting the 4 ETHER_VLAN_ENCAP_LEN
2057	 * bytes that follow what the firmware thought was the end of
2058	 * the ethernet header.
2059	 */
2060
2061	/* put checksum into host byte order */
2062	*csum = ntohs(*csum);
2063	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
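	/* 1s-complement subtraction: add the complement of the 4 VLAN
	   bytes, propagate the end-around carry, then fold the 32-bit
	   sum back to 16 bits (twice, in case the first fold carries) */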
2064	(*csum) += ~partial;
2065	(*csum) +=  ((*csum) < ~partial);
2066	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2067	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2068
2069	/* restore checksum to network byte order;
2070	   later consumers expect this */
2071	*csum = htons(*csum);
2072
2073	/* save the tag */
2074	m->m_flags |= M_VLANTAG;
2075	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2076
2077	/*
2078	 * Remove the 802.1q header by copying the Ethernet
2079	 * addresses over it and adjusting the beginning of
2080	 * the data in the mbuf.  The encapsulated Ethernet
2081	 * type field is already in place.
2082	 */
2083	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2084	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2085	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2086}
2087
2088
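/*
 * Receive completion: a replacement mbuf is mapped via the spare
 * (extra) DMA map before the received one is handed up the stack.
 * If the replacement cannot be allocated, the frame is dropped and
 * the old mbuf stays in the ring, so a slot is never lost to memory
 * pressure.
 */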
2089static inline void
2090mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2091{
2092	struct ifnet *ifp;
2093	struct mbuf *m;
2094	struct ether_header *eh;
2095	mxge_rx_buf_t *rx;
2096	bus_dmamap_t old_map;
2097	int idx;
2098	uint16_t tcpudp_csum;
2099
2100	ifp = sc->ifp;
2101	rx = &sc->rx_big;
2102	idx = rx->cnt & rx->mask;
2103	rx->cnt += rx->nbufs;
2104	/* save a pointer to the received mbuf */
2105	m = rx->info[idx].m;
2106	/* try to replace the received mbuf */
2107	if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
2108		/* drop the frame -- the old mbuf is re-cycled */
2109		ifp->if_ierrors++;
2110		return;
2111	}
2112
2113	/* unmap the received buffer */
2114	old_map = rx->info[idx].map;
2115	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2116	bus_dmamap_unload(rx->dmat, old_map);
2117
2118	/* swap the bus_dmamap_t's */
2119	rx->info[idx].map = rx->extra_map;
2120	rx->extra_map = old_map;
2121
2122	/* the mcp implicitly skips the first 2 bytes so that the
2123	 * packet is properly aligned */
2124	m->m_data += MXGEFW_PAD;
2125
2126	m->m_pkthdr.rcvif = ifp;
2127	m->m_len = m->m_pkthdr.len = len;
2128	ifp->if_ipackets++;
2129	eh = mtod(m, struct ether_header *);
2130	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2131		mxge_vlan_tag_remove(m, &csum);
2132	}
2133	/* if the checksum is valid, mark it in the mbuf header */
2134	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2135		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2136			return;
2137		/* otherwise, it was a UDP frame, or a TCP frame which
2138		   we could not do LRO on.  Tell the stack that the
2139		   checksum is good */
2140		m->m_pkthdr.csum_data = 0xffff;
2141		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2142	}
2143	/* pass the frame up the stack */
2144	(*ifp->if_input)(ifp, m);
2145}
2146
2147static inline void
2148mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2149{
2150	struct ifnet *ifp;
2151	struct ether_header *eh;
2152	struct mbuf *m;
2153	mxge_rx_buf_t *rx;
2154	bus_dmamap_t old_map;
2155	int idx;
2156	uint16_t tcpudp_csum;
2157
2158	ifp = sc->ifp;
2159	rx = &sc->rx_small;
2160	idx = rx->cnt & rx->mask;
2161	rx->cnt++;
2162	/* save a pointer to the received mbuf */
2163	m = rx->info[idx].m;
2164	/* try to replace the received mbuf */
2165	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
2166		/* drop the frame -- the old mbuf is re-cycled */
2167		ifp->if_ierrors++;
2168		return;
2169	}
2170
2171	/* unmap the received buffer */
2172	old_map = rx->info[idx].map;
2173	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2174	bus_dmamap_unload(rx->dmat, old_map);
2175
2176	/* swap the bus_dmamap_t's */
2177	rx->info[idx].map = rx->extra_map;
2178	rx->extra_map = old_map;
2179
2180	/* the mcp implicitly skips the first 2 bytes so that the
2181	 * packet is properly aligned */
2182	m->m_data += MXGEFW_PAD;
2183
2184	m->m_pkthdr.rcvif = ifp;
2185	m->m_len = m->m_pkthdr.len = len;
2186	ifp->if_ipackets++;
2187	eh = mtod(m, struct ether_header *);
2188	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2189		mxge_vlan_tag_remove(m, &csum);
2190	}
2191	/* if the checksum is valid, mark it in the mbuf header */
2192	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2193		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2194			return;
2195		/* otherwise, it was a UDP frame, or a TCP frame which
2196		   we could not do LRO on.  Tell the stack that the
2197		   checksum is good */
2198		m->m_pkthdr.csum_data = 0xffff;
2199		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2200	}
2201
2202	/* pass the frame up the stack */
2203	(*ifp->if_input)(ifp, m);
2204}
2205
2206static inline void
2207mxge_clean_rx_done(mxge_softc_t *sc)
2208{
2209	mxge_rx_done_t *rx_done = &sc->rx_done;
2210	struct lro_entry *lro;
2211	int limit = 0;
2212	uint16_t length;
2213	uint16_t checksum;
2214
2215
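	/* a nonzero length marks a completion slot as valid; it is
	   zeroed here and re-written by the firmware when reused */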
2216	while (rx_done->entry[rx_done->idx].length != 0) {
2217		length = ntohs(rx_done->entry[rx_done->idx].length);
2218		rx_done->entry[rx_done->idx].length = 0;
2219		checksum = rx_done->entry[rx_done->idx].checksum;
2220		if (length <= (MHLEN - MXGEFW_PAD))
2221			mxge_rx_done_small(sc, length, checksum);
2222		else
2223			mxge_rx_done_big(sc, length, checksum);
2224		rx_done->cnt++;
2225		rx_done->idx = rx_done->cnt & rx_done->mask;
2226
2227		/* limit potential for livelock */
2228		if (__predict_false(++limit > 2 * rx_done->mask))
2229			break;
2230	}
2231	while(!SLIST_EMPTY(&sc->lro_active)) {
2232		lro = SLIST_FIRST(&sc->lro_active);
2233		SLIST_REMOVE_HEAD(&sc->lro_active, next);
2234		mxge_lro_flush(sc, lro);
2235	}
2236}
2237
2238
2239static inline void
2240mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2241{
2242	struct ifnet *ifp;
2243	mxge_tx_buf_t *tx;
2244	struct mbuf *m;
2245	bus_dmamap_t map;
2246	int idx, limit;
2247
2248	limit = 0;
2249	tx = &sc->tx;
2250	ifp = sc->ifp;
2251	while (tx->pkt_done != mcp_idx) {
2252		idx = tx->done & tx->mask;
2253		tx->done++;
2254		m = tx->info[idx].m;
2255		/* mbuf and DMA map only attached to the first
2256		   segment per-mbuf */
2257		if (m != NULL) {
2258			ifp->if_opackets++;
2259			tx->info[idx].m = NULL;
2260			map = tx->info[idx].map;
2261			bus_dmamap_unload(tx->dmat, map);
2262			m_freem(m);
2263		}
2264		if (tx->info[idx].flag) {
2265			tx->info[idx].flag = 0;
2266			tx->pkt_done++;
2267		}
2268		/* limit potential for livelock by only handling
2269		   2 full tx rings per call */
2270		if (__predict_false(++limit >  2 * tx->mask))
2271			break;
2272	}
2273
2274	/* Once fewer than a quarter of the descriptors are in flight,
2275	   clear IFF_DRV_OACTIVE to tell the stack it's OK to send again */
2276
2277	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2278	    tx->req - tx->done < (tx->mask + 1)/4) {
2279		mtx_lock(&sc->tx_mtx);
2280		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2281		sc->tx.wake++;
2282		mxge_start_locked(sc);
2283		mtx_unlock(&sc->tx_mtx);
2284	}
2285}
2286
2287static void
2288mxge_intr(void *arg)
2289{
2290	mxge_softc_t *sc = arg;
2291	mcp_irq_data_t *stats = sc->fw_stats;
2292	mxge_tx_buf_t *tx = &sc->tx;
2293	mxge_rx_done_t *rx_done = &sc->rx_done;
2294	uint32_t send_done_count;
2295	uint8_t valid;
2296
2297
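	/*
	 * The NIC DMAs the irq data block into host memory; valid is
	 * expected to land last, so a nonzero valid means the rest of
	 * the block can be trusted.
	 */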
2298	/* make sure the DMA has finished */
2299	if (!stats->valid) {
2300		return;
2301	}
2302	valid = stats->valid;
2303
2304	if (!sc->msi_enabled) {
2305		/* lower the legacy IRQ */
2306		*sc->irq_deassert = 0;
2307		if (!mxge_deassert_wait)
2308			/* don't wait for confirmation that the irq is low */
2309			stats->valid = 0;
2310	} else {
2311		stats->valid = 0;
2312	}
2313
2314	/* loop while waiting for legacy irq deassertion */
2315	do {
2316		/* check for transmit completes and receives */
2317		send_done_count = be32toh(stats->send_done_count);
2318		while ((send_done_count != tx->pkt_done) ||
2319		       (rx_done->entry[rx_done->idx].length != 0)) {
2320			mxge_tx_done(sc, (int)send_done_count);
2321			mxge_clean_rx_done(sc);
2322			send_done_count = be32toh(stats->send_done_count);
2323		}
2324	} while (*((volatile uint8_t *) &stats->valid));
2325
2326	if (__predict_false(stats->stats_updated)) {
2327		if (sc->link_state != stats->link_up) {
2328			sc->link_state = stats->link_up;
2329			if (sc->link_state) {
2330				if_link_state_change(sc->ifp, LINK_STATE_UP);
2331				if (mxge_verbose)
2332					device_printf(sc->dev, "link up\n");
2333			} else {
2334				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2335				if (mxge_verbose)
2336					device_printf(sc->dev, "link down\n");
2337			}
2338		}
2339		if (sc->rdma_tags_available !=
2340		    be32toh(sc->fw_stats->rdma_tags_available)) {
2341			sc->rdma_tags_available =
2342				be32toh(sc->fw_stats->rdma_tags_available);
2343			device_printf(sc->dev, "RDMA timed out! %d tags "
2344				      "left\n", sc->rdma_tags_available);
2345		}
2346		sc->down_cnt += stats->link_down;
2347	}
2348
2349	/* check to see if we have rx token to pass back */
2350	if (valid & 0x1)
2351		*sc->irq_claim = be32toh(3);
2352	*(sc->irq_claim + 1) = be32toh(3);
2353}
2354
2355static void
2356mxge_init(void *arg)
2357{
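	/* nothing to do here: the interface is actually brought up by
	   mxge_open() from the SIOCSIFFLAGS ioctl path */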
2358}
2359
2360
2361
2362static void
2363mxge_free_mbufs(mxge_softc_t *sc)
2364{
2365	int i;
2366
2367	for (i = 0; i <= sc->rx_big.mask; i++) {
2368		if (sc->rx_big.info[i].m == NULL)
2369			continue;
2370		bus_dmamap_unload(sc->rx_big.dmat,
2371				  sc->rx_big.info[i].map);
2372		m_freem(sc->rx_big.info[i].m);
2373		sc->rx_big.info[i].m = NULL;
2374	}
2375
2376	for (i = 0; i <= sc->rx_small.mask; i++) {
2377		if (sc->rx_small.info[i].m == NULL)
2378			continue;
2379		bus_dmamap_unload(sc->rx_small.dmat,
2380				  sc->rx_small.info[i].map);
2381		m_freem(sc->rx_small.info[i].m);
2382		sc->rx_small.info[i].m = NULL;
2383	}
2384
2385	for (i = 0; i <= sc->tx.mask; i++) {
2386		sc->tx.info[i].flag = 0;
2387		if (sc->tx.info[i].m == NULL)
2388			continue;
2389		bus_dmamap_unload(sc->tx.dmat,
2390				  sc->tx.info[i].map);
2391		m_freem(sc->tx.info[i].m);
2392		sc->tx.info[i].m = NULL;
2393	}
2394}
2395
2396static void
2397mxge_free_rings(mxge_softc_t *sc)
2398{
2399	int i;
2400
2401	if (sc->rx_done.entry != NULL)
2402		mxge_dma_free(&sc->rx_done.dma);
2403	sc->rx_done.entry = NULL;
2404	if (sc->tx.req_bytes != NULL)
2405		free(sc->tx.req_bytes, M_DEVBUF);
2406	if (sc->tx.seg_list != NULL)
2407		free(sc->tx.seg_list, M_DEVBUF);
2408	if (sc->rx_small.shadow != NULL)
2409		free(sc->rx_small.shadow, M_DEVBUF);
2410	if (sc->rx_big.shadow != NULL)
2411		free(sc->rx_big.shadow, M_DEVBUF);
2412	if (sc->tx.info != NULL) {
2413		if (sc->tx.dmat != NULL) {
2414			for (i = 0; i <= sc->tx.mask; i++) {
2415				bus_dmamap_destroy(sc->tx.dmat,
2416						   sc->tx.info[i].map);
2417			}
2418			bus_dma_tag_destroy(sc->tx.dmat);
2419		}
2420		free(sc->tx.info, M_DEVBUF);
2421	}
2422	if (sc->rx_small.info != NULL) {
2423		if (sc->rx_small.dmat != NULL) {
2424			for (i = 0; i <= sc->rx_small.mask; i++) {
2425				bus_dmamap_destroy(sc->rx_small.dmat,
2426						   sc->rx_small.info[i].map);
2427			}
2428			bus_dmamap_destroy(sc->rx_small.dmat,
2429					   sc->rx_small.extra_map);
2430			bus_dma_tag_destroy(sc->rx_small.dmat);
2431		}
2432		free(sc->rx_small.info, M_DEVBUF);
2433	}
2434	if (sc->rx_big.info != NULL) {
2435		if (sc->rx_big.dmat != NULL) {
2436			for (i = 0; i <= sc->rx_big.mask; i++) {
2437				bus_dmamap_destroy(sc->rx_big.dmat,
2438						   sc->rx_big.info[i].map);
2439			}
2440			bus_dmamap_destroy(sc->rx_big.dmat,
2441					   sc->rx_big.extra_map);
2442			bus_dma_tag_destroy(sc->rx_big.dmat);
2443		}
2444		free(sc->rx_big.info, M_DEVBUF);
2445	}
2446}
2447
2448static int
2449mxge_alloc_rings(mxge_softc_t *sc)
2450{
2451	mxge_cmd_t cmd;
2452	int tx_ring_size, rx_ring_size;
2453	int tx_ring_entries, rx_ring_entries;
2454	int i, err;
2455	unsigned long bytes;
2456
2457	/* get ring sizes */
2458	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2459	tx_ring_size = cmd.data0;
2460	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
2461	if (err != 0) {
2462		device_printf(sc->dev, "Cannot determine ring sizes\n");
2463		goto abort_with_nothing;
2464	}
2465
2466	rx_ring_size = cmd.data0;
2467
2468	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
2469	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
2470	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
2471	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
2472	IFQ_SET_READY(&sc->ifp->if_snd);
2473
2474	sc->tx.mask = tx_ring_entries - 1;
2475	sc->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2476	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
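	/* the completion (rx_done) ring carries events for both the
	   small and big rx rings, so it gets twice the entries */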
2477	sc->rx_done.mask = (2 * rx_ring_entries) - 1;
2478
2479	err = ENOMEM;
2480
2481	/* allocate interrupt queues */
2482	bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
2483	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2484	if (err != 0)
2485		goto abort_with_nothing;
2486	sc->rx_done.entry = sc->rx_done.dma.addr;
2487	bzero(sc->rx_done.entry, bytes);
2488
2489	/* allocate the tx request copy block */
2490	bytes = 8 +
2491		sizeof (*sc->tx.req_list) * (sc->tx.max_desc + 4);
2492	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2493	if (sc->tx.req_bytes == NULL)
2494		goto abort_with_alloc;
2495	/* ensure req_list entries are aligned to 8 bytes */
2496	sc->tx.req_list = (mcp_kreq_ether_send_t *)
2497		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
2498
2499	/* allocate the tx busdma segment list */
2500	bytes = sizeof (*sc->tx.seg_list) * sc->tx.max_desc;
2501	sc->tx.seg_list = (bus_dma_segment_t *)
2502		malloc(bytes, M_DEVBUF, M_WAITOK);
2503	if (sc->tx.seg_list == NULL)
2504		goto abort_with_alloc;
2505
2506	/* allocate the rx shadow rings */
2507	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
2508	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2509	if (sc->rx_small.shadow == NULL)
2510		goto abort_with_alloc;
2511
2512	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
2513	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2514	if (sc->rx_big.shadow == NULL)
2515		goto abort_with_alloc;
2516
2517	/* allocate the host info rings */
2518	bytes = tx_ring_entries * sizeof (*sc->tx.info);
2519	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2520	if (sc->tx.info == NULL)
2521		goto abort_with_alloc;
2522
2523	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
2524	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2525	if (sc->rx_small.info == NULL)
2526		goto abort_with_alloc;
2527
2528	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
2529	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2530	if (sc->rx_big.info == NULL)
2531		goto abort_with_alloc;
2532
2533	/* allocate the busdma resources */
2534	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2535				 1,			/* alignment */
2536				 sc->tx.boundary,	/* boundary */
2537				 BUS_SPACE_MAXADDR,	/* low */
2538				 BUS_SPACE_MAXADDR,	/* high */
2539				 NULL, NULL,		/* filter */
2540				 65536 + 256,		/* maxsize */
2541				 sc->tx.max_desc - 2,	/* num segs */
2542				 sc->tx.boundary,	/* maxsegsize */
2543				 BUS_DMA_ALLOCNOW,	/* flags */
2544				 NULL, NULL,		/* lock */
2545				 &sc->tx.dmat);		/* tag */
2546
2547	if (err != 0) {
2548		device_printf(sc->dev, "Err %d allocating tx dmat\n",
2549			      err);
2550		goto abort_with_alloc;
2551	}
2552
2553	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2554				 1,			/* alignment */
2555				 4096,			/* boundary */
2556				 BUS_SPACE_MAXADDR,	/* low */
2557				 BUS_SPACE_MAXADDR,	/* high */
2558				 NULL, NULL,		/* filter */
2559				 MHLEN,			/* maxsize */
2560				 1,			/* num segs */
2561				 MHLEN,			/* maxsegsize */
2562				 BUS_DMA_ALLOCNOW,	/* flags */
2563				 NULL, NULL,		/* lock */
2564				 &sc->rx_small.dmat);	/* tag */
2565	if (err != 0) {
2566		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2567			      err);
2568		goto abort_with_alloc;
2569	}
2570
2571	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2572				 1,			/* alignment */
2573				 4096,			/* boundary */
2574				 BUS_SPACE_MAXADDR,	/* low */
2575				 BUS_SPACE_MAXADDR,	/* high */
2576				 NULL, NULL,		/* filter */
2577				 3*4096,		/* maxsize */
2578				 3,			/* num segs */
2579				 4096,			/* maxsegsize */
2580				 BUS_DMA_ALLOCNOW,	/* flags */
2581				 NULL, NULL,		/* lock */
2582				 &sc->rx_big.dmat);	/* tag */
2583	if (err != 0) {
2584		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2585			      err);
2586		goto abort_with_alloc;
2587	}
2588
2589	/* now use these tags to setup dmamaps for each slot
2590	   in each ring */
2591	for (i = 0; i <= sc->tx.mask; i++) {
2592		err = bus_dmamap_create(sc->tx.dmat, 0,
2593					&sc->tx.info[i].map);
2594		if (err != 0) {
2595			device_printf(sc->dev, "Err %d  tx dmamap\n",
2596			      err);
2597			goto abort_with_alloc;
2598		}
2599	}
2600	for (i = 0; i <= sc->rx_small.mask; i++) {
2601		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2602					&sc->rx_small.info[i].map);
2603		if (err != 0) {
2604			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2605				      err);
2606			goto abort_with_alloc;
2607		}
2608	}
2609	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2610				&sc->rx_small.extra_map);
2611	if (err != 0) {
2612		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2613			      err);
2614		goto abort_with_alloc;
2615	}
2616
2617	for (i = 0; i <= sc->rx_big.mask; i++) {
2618		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2619					&sc->rx_big.info[i].map);
2620		if (err != 0) {
2621			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2622			      err);
2623			goto abort_with_alloc;
2624		}
2625	}
2626	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2627				&sc->rx_big.extra_map);
2628	if (err != 0) {
2629		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2630			      err);
2631		goto abort_with_alloc;
2632	}
2633	return 0;
2634
2635abort_with_alloc:
2636	mxge_free_rings(sc);
2637
2638abort_with_nothing:
2639	return err;
2640}
2641
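/*
 * Example, assuming 4KB pages: a 9000-byte MTU needs
 * 9000 + 14 + 4 + 2 = 9020 bytes once the Ethernet header, VLAN
 * encapsulation and firmware pad are added, so frames land in 9KB
 * clusters advertised to the firmware as 4096-byte buffers:
 * 9000/4096 + 1 = 3 buffers, rounded up to 4 to keep the ring
 * stride a power of two.
 */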
2642static void
2643mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
2644{
2645	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
2646
2647	if (bufsize < MCLBYTES) {
2648		/* easy, everything fits in a single buffer */
2649		*big_buf_size = MCLBYTES;
2650		*cl_size = MCLBYTES;
2651		*nbufs = 1;
2652		return;
2653	}
2654
2655	if (bufsize < MJUMPAGESIZE) {
2656		/* still easy, everything still fits in a single buffer */
2657		*big_buf_size = MJUMPAGESIZE;
2658		*cl_size = MJUMPAGESIZE;
2659		*nbufs = 1;
2660		return;
2661	}
2662	/* now we need to use virtually contiguous buffers */
2663	*cl_size = MJUM9BYTES;
2664	*big_buf_size = 4096;
2665	*nbufs = mtu / 4096 + 1;
2666	/* needs to be a power of two, so round up */
2667	if (*nbufs == 3)
2668		*nbufs = 4;
2669}
2670
2671static int
2672mxge_open(mxge_softc_t *sc)
2673{
2674	mxge_cmd_t cmd;
2675	int i, err, big_bytes;
2676	bus_dmamap_t map;
2677	bus_addr_t bus;
2678	struct lro_entry *lro_entry;
2679
2680	SLIST_INIT(&sc->lro_free);
2681	SLIST_INIT(&sc->lro_active);
2682
2683	for (i = 0; i < sc->lro_cnt; i++) {
2684		lro_entry = (struct lro_entry *)
2685			malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
2686		if (lro_entry == NULL) {
2687			sc->lro_cnt = i;
2688			break;
2689		}
2690		SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
2691	}
2692
2693	/* Copy the MAC address in case it was overridden */
2694	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
2695
2696	err = mxge_reset(sc, 1);
2697	if (err != 0) {
2698		device_printf(sc->dev, "failed to reset\n");
2699		return EIO;
2700	}
2701
2702	mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
2703			   &sc->rx_big.cl_size, &sc->rx_big.nbufs);
2704
2705	cmd.data0 = sc->rx_big.nbufs;
2706	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
2707			    &cmd);
2708	/* error is only meaningful if we're trying to set
2709	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
2710	if (err && sc->rx_big.nbufs > 1) {
2711		device_printf(sc->dev,
2712			      "Failed to set always-use-n to %d\n",
2713			      sc->rx_big.nbufs);
2714		return EIO;
2715	}
2716	/* get the lanai pointers to the send and receive rings */
2717
2718	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2719	sc->tx.lanai =
2720		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2721	err |= mxge_send_cmd(sc,
2722				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2723	sc->rx_small.lanai =
2724		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2725	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2726	sc->rx_big.lanai =
2727		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2728
2729	if (err != 0) {
2730		device_printf(sc->dev,
2731			      "failed to get ring sizes or locations\n");
2732		return EIO;
2733	}
2734
2735	/* stock receive rings */
2736	for (i = 0; i <= sc->rx_small.mask; i++) {
2737		map = sc->rx_small.info[i].map;
2738		err = mxge_get_buf_small(sc, map, i);
2739		if (err) {
2740			device_printf(sc->dev, "alloced %d/%d smalls\n",
2741				      i, sc->rx_small.mask + 1);
2742			goto abort;
2743		}
2744	}
2745	for (i = 0; i <= sc->rx_big.mask; i++) {
2746		sc->rx_big.shadow[i].addr_low = 0xffffffff;
2747		sc->rx_big.shadow[i].addr_high = 0xffffffff;
2748	}
2749	for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
2750		map = sc->rx_big.info[i].map;
2751		err = mxge_get_buf_big(sc, map, i);
2752		if (err) {
2753			device_printf(sc->dev, "alloced %d/%d bigs\n",
2754				      i, sc->rx_big.mask + 1);
2755			goto abort;
2756		}
2757	}
2758
2759	/* Give the firmware the mtu and the big and small buffer
2760	   sizes.  The firmware wants the big buf size to be a power
2761	   of two. Luckily, FreeBSD's clusters are powers of two */
2762	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
2763	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2764	cmd.data0 = MHLEN - MXGEFW_PAD;
2765	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2766			     &cmd);
2767	cmd.data0 = big_bytes;
2768	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2769
2770	if (err != 0) {
2771		device_printf(sc->dev, "failed to setup params\n");
2772		goto abort;
2773	}
2774
2775	/* Now give the firmware a pointer to the stats block */
2776	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
2777	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
2778	cmd.data2 = sizeof(struct mcp_irq_data);
2779	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
2780
2781	if (err != 0) {
2782		bus = sc->fw_stats_dma.bus_addr;
2783		bus += offsetof(struct mcp_irq_data, send_done_count);
2784		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
2785		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
2786		err = mxge_send_cmd(sc,
2787				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
2788				    &cmd);
2789		/* Firmware cannot support multicast without STATS_DMA_V2 */
2790		sc->fw_multicast_support = 0;
2791	} else {
2792		sc->fw_multicast_support = 1;
2793	}
2794
2795	if (err != 0) {
2796		device_printf(sc->dev, "failed to setup params\n");
2797		goto abort;
2798	}
2799
2800	/* Finally, start the firmware running */
2801	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
2802	if (err) {
2803		device_printf(sc->dev, "Couldn't bring up link\n");
2804		goto abort;
2805	}
2806	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
2807	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2808
2809	return 0;
2810
2811
2812abort:
2813	mxge_free_mbufs(sc);
2814
2815	return err;
2816}
2817
2818static int
2819mxge_close(mxge_softc_t *sc)
2820{
2821	struct lro_entry *lro_entry;
2822	mxge_cmd_t cmd;
2823	int err, old_down_cnt;
2824
2825	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2826	old_down_cnt = sc->down_cnt;
2827	mb();
2828	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2829	if (err) {
2830		device_printf(sc->dev, "Couldn't bring down link\n");
2831	}
2832	if (old_down_cnt == sc->down_cnt) {
2833		/* wait for down irq */
2834		DELAY(10 * sc->intr_coal_delay);
2835	}
2836	if (old_down_cnt == sc->down_cnt) {
2837		device_printf(sc->dev, "never got down irq\n");
2838	}
2839
2840	mxge_free_mbufs(sc);
2841
2842	while (!SLIST_EMPTY(&sc->lro_free)) {
2843		lro_entry = SLIST_FIRST(&sc->lro_free);
2844		SLIST_REMOVE_HEAD(&sc->lro_free, next);
		/* the entries were malloc'd in mxge_open(); free them
		   here so repeated open/close cycles don't leak them */
		free(lro_entry, M_DEVBUF);
2845	}
2846	return 0;
2847}
2848
2849static void
2850mxge_setup_cfg_space(mxge_softc_t *sc)
2851{
2852	device_t dev = sc->dev;
2853	int reg;
2854	uint16_t cmd, lnk, pectl;
2855
2856	/* find the PCIe link width and set max read request to 4KB */
2857	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
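		/* PCIe Link Status register (cap + 0x12): the
		   negotiated link width is in bits 9:4 */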
2858		lnk = pci_read_config(dev, reg + 0x12, 2);
2859		sc->link_width = (lnk >> 4) & 0x3f;
2860
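		/* PCIe Device Control register (cap + 0x8): max read
		   request size is bits 14:12, encoded as 128 << n, so
		   5 selects 4096 bytes */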
2861		pectl = pci_read_config(dev, reg + 0x8, 2);
2862		pectl = (pectl & ~0x7000) | (5 << 12);
2863		pci_write_config(dev, reg + 0x8, pectl, 2);
2864	}
2865
2866	/* Enable DMA and Memory space access */
2867	pci_enable_busmaster(dev);
2868	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2869	cmd |= PCIM_CMD_MEMEN;
2870	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2871}
2872
2873static uint32_t
2874mxge_read_reboot(mxge_softc_t *sc)
2875{
2876	device_t dev = sc->dev;
2877	uint32_t vs;
2878
2879	/* find the vendor specific offset */
2880	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
2881		device_printf(sc->dev,
2882			      "could not find vendor specific offset\n");
2883		return (uint32_t)-1;
2884	}
2885	/* enable read32 mode */
2886	pci_write_config(dev, vs + 0x10, 0x3, 1);
2887	/* tell NIC which register to read */
2888	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
2889	return (pci_read_config(dev, vs + 0x14, 4));
2890}
2891
2892static void
2893mxge_watchdog_reset(mxge_softc_t *sc)
2894{
2895	int err;
2896	uint32_t reboot;
2897	uint16_t cmd;
2898
2899	err = ENXIO;
2900
2901	device_printf(sc->dev, "Watchdog reset!\n");
2902
2903	/*
2904	 * check to see if the NIC rebooted.  If it did, then all of
2905	 * PCI config space has been reset, and things like the
2906	 * busmaster bit will be zero.  If this is the case, then we
2907	 * must restore PCI config space before the NIC can be used
2908	 * again
2909	 */
2910	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2911	if (cmd == 0xffff) {
2912		/*
2913		 * maybe the watchdog caught the NIC rebooting; wait
2914		 * up to 100ms for it to finish.  If it does not come
2915		 * back, then give up
2916		 */
2917		DELAY(1000*100);
2918		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2919		if (cmd == 0xffff) {
2920			device_printf(sc->dev, "NIC disappeared!\n");
2921			goto abort;
2922		}
2923	}
2924	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
2925		/* print the reboot status */
2926		reboot = mxge_read_reboot(sc);
2927		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
2928			      reboot);
2929		/* restore PCI configuration space */
2930
2931		/* XXXX waiting for pci_cfg_restore() to be exported */
2932		goto abort; /* just abort for now */
2933
2934		/* and redo any changes we made to our config space */
2935		mxge_setup_cfg_space(sc);
2936	} else {
2937		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
2938		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
2939			      sc->tx.req, sc->tx.done);
2940		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
2941			      sc->tx.pkt_done,
2942			      be32toh(sc->fw_stats->send_done_count));
2943	}
2944
2945	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
2946		mxge_close(sc);
2947		err = mxge_open(sc);
2948	}
2949
2950abort:
2951	/*
2952	 * stop the watchdog if the nic is dead, to avoid spamming the
2953	 * console
2954	 */
2955	if (err != 0) {
2956		callout_stop(&sc->co_hdl);
2957	}
2958}
2959
2960static void
2961mxge_watchdog(mxge_softc_t *sc)
2962{
2963	mxge_tx_buf_t *tx = &sc->tx;
2964
2965	/* reset if transmits were already outstanding at the last
2966	   tick and none have completed since (tx->done unchanged) */
2967	if (tx->req != tx->done &&
2968	    tx->watchdog_req != tx->watchdog_done &&
2969	    tx->done == tx->watchdog_done)
2970		mxge_watchdog_reset(sc);
2971
2972	tx->watchdog_req = tx->req;
2973	tx->watchdog_done = tx->done;
2974}
2975
2976static void
2977mxge_tick(void *arg)
2978{
2979	mxge_softc_t *sc = arg;
2980
2981
2982	/* Synchronize with possible callout reset/stop. */
2983	if (callout_pending(&sc->co_hdl) ||
2984	    !callout_active(&sc->co_hdl)) {
2985		mtx_unlock(&sc->driver_mtx);
2986		return;
2987	}
2988
2989	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2990	mxge_watchdog(sc);
2991}
2992
2993static int
2994mxge_media_change(struct ifnet *ifp)
2995{
2996	return EINVAL;
2997}
2998
2999static int
3000mxge_change_mtu(mxge_softc_t *sc, int mtu)
3001{
3002	struct ifnet *ifp = sc->ifp;
3003	int real_mtu, old_mtu;
3004	int err = 0;
3005
3006
3007	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3008	if (real_mtu > sc->max_mtu || real_mtu < 60)
3009		return EINVAL;
3010	mtx_lock(&sc->driver_mtx);
3011	old_mtu = ifp->if_mtu;
3012	ifp->if_mtu = mtu;
3013	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3014		callout_stop(&sc->co_hdl);
3015		mxge_close(sc);
3016		err = mxge_open(sc);
3017		if (err != 0) {
3018			ifp->if_mtu = old_mtu;
3019			mxge_close(sc);
3020			(void) mxge_open(sc);
3021		}
3022		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3023	}
3024	mtx_unlock(&sc->driver_mtx);
3025	return err;
3026}
3027
3028static void
3029mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3030{
3031	mxge_softc_t *sc = ifp->if_softc;
3032
3033
3034	if (sc == NULL)
3035		return;
3036	ifmr->ifm_status = IFM_AVALID;
3037	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
3038	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3039	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
3040}
3041
3042static int
3043mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3044{
3045	mxge_softc_t *sc = ifp->if_softc;
3046	struct ifreq *ifr = (struct ifreq *)data;
3047	int err, mask;
3048
3049	err = 0;
3050	switch (command) {
3051	case SIOCSIFADDR:
3052	case SIOCGIFADDR:
3053		err = ether_ioctl(ifp, command, data);
3054		break;
3055
3056	case SIOCSIFMTU:
3057		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3058		break;
3059
3060	case SIOCSIFFLAGS:
3061		mtx_lock(&sc->driver_mtx);
3062		if (ifp->if_flags & IFF_UP) {
3063			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3064				err = mxge_open(sc);
3065				callout_reset(&sc->co_hdl, mxge_ticks,
3066					      mxge_tick, sc);
3067			} else {
3068				/* take care of promisc and allmulti
3069				   flag changes */
3070				mxge_change_promisc(sc,
3071						    ifp->if_flags & IFF_PROMISC);
3072				mxge_set_multicast_list(sc);
3073			}
3074		} else {
3075			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3076				mxge_close(sc);
3077				callout_stop(&sc->co_hdl);
3078			}
3079		}
3080		mtx_unlock(&sc->driver_mtx);
3081		break;
3082
3083	case SIOCADDMULTI:
3084	case SIOCDELMULTI:
3085		mtx_lock(&sc->driver_mtx);
3086		mxge_set_multicast_list(sc);
3087		mtx_unlock(&sc->driver_mtx);
3088		break;
3089
3090	case SIOCSIFCAP:
3091		mtx_lock(&sc->driver_mtx);
3092		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3093		if (mask & IFCAP_TXCSUM) {
3094			if (IFCAP_TXCSUM & ifp->if_capenable) {
3095				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3096				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3097						      | CSUM_TSO);
3098			} else {
3099				ifp->if_capenable |= IFCAP_TXCSUM;
3100				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3101			}
3102		} else if (mask & IFCAP_RXCSUM) {
3103			if (IFCAP_RXCSUM & ifp->if_capenable) {
3104				ifp->if_capenable &= ~IFCAP_RXCSUM;
3105				sc->csum_flag = 0;
3106			} else {
3107				ifp->if_capenable |= IFCAP_RXCSUM;
3108				sc->csum_flag = 1;
3109			}
3110		}
3111		if (mask & IFCAP_TSO4) {
3112			if (IFCAP_TSO4 & ifp->if_capenable) {
3113				ifp->if_capenable &= ~IFCAP_TSO4;
3114				ifp->if_hwassist &= ~CSUM_TSO;
3115			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3116				ifp->if_capenable |= IFCAP_TSO4;
3117				ifp->if_hwassist |= CSUM_TSO;
3118			} else {
3119				printf("mxge requires tx checksum offload"
3120				       " be enabled to use TSO\n");
3121				err = EINVAL;
3122			}
3123		}
3124
3125		if (mask & IFCAP_VLAN_HWTAGGING)
3126			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3127		mtx_unlock(&sc->driver_mtx);
3128		VLAN_CAPABILITIES(ifp);
3129
3130		break;
3131
3132	case SIOCGIFMEDIA:
3133		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3134				    &sc->media, command);
3135		break;
3136
3137	default:
3138		err = ENOTTY;
3139	}
3140	return err;
3141}
3142
3143static void
3144mxge_fetch_tunables(mxge_softc_t *sc)
3145{
3146
3147	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3148			  &mxge_flow_control);
3149	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3150			  &mxge_intr_coal_delay);
3151	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3152			  &mxge_nvidia_ecrc_enable);
3153	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3154			  &mxge_force_firmware);
3155	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3156			  &mxge_deassert_wait);
3157	TUNABLE_INT_FETCH("hw.mxge.verbose",
3158			  &mxge_verbose);
3159	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3160	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3161
3162	if (bootverbose)
3163		mxge_verbose = 1;
3164	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3165		mxge_intr_coal_delay = 30;
3166	if (mxge_ticks == 0)
3167		mxge_ticks = hz;
3168	sc->pause = mxge_flow_control;
3169
3170}
3171
3172static int
3173mxge_attach(device_t dev)
3174{
3175	mxge_softc_t *sc = device_get_softc(dev);
3176	struct ifnet *ifp;
3177	int count, rid, err;
3178
3179	sc->dev = dev;
3180	mxge_fetch_tunables(sc);
3181
3182	err = bus_dma_tag_create(NULL,			/* parent */
3183				 1,			/* alignment */
3184				 4096,			/* boundary */
3185				 BUS_SPACE_MAXADDR,	/* low */
3186				 BUS_SPACE_MAXADDR,	/* high */
3187				 NULL, NULL,		/* filter */
3188				 65536 + 256,		/* maxsize */
3189				 MXGE_MAX_SEND_DESC, 	/* num segs */
3190				 4096,			/* maxsegsize */
3191				 0,			/* flags */
3192				 NULL, NULL,		/* lock */
3193				 &sc->parent_dmat);	/* tag */
3194
3195	if (err != 0) {
3196		device_printf(sc->dev, "Err %d allocating parent dmat\n",
3197			      err);
3198		goto abort_with_nothing;
3199	}
3200
3201	ifp = sc->ifp = if_alloc(IFT_ETHER);
3202	if (ifp == NULL) {
3203		device_printf(dev, "can not if_alloc()\n");
3204		err = ENOSPC;
3205		goto abort_with_parent_dmat;
3206	}
3207	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
3208		 device_get_nameunit(dev));
3209	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
3210	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
3211		 device_get_nameunit(dev));
3212	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
3213	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
3214		 "%s:drv", device_get_nameunit(dev));
3215	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
3216		 MTX_NETWORK_LOCK, MTX_DEF);
3217
3218	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
3219
3220	mxge_setup_cfg_space(sc);
3221
3222	/* Map the board into the kernel */
3223	rid = PCIR_BARS;
3224	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
3225					 ~0, 1, RF_ACTIVE);
3226	if (sc->mem_res == NULL) {
3227		device_printf(dev, "could not map memory\n");
3228		err = ENXIO;
3229		goto abort_with_lock;
3230	}
3231	sc->sram = rman_get_virtual(sc->mem_res);
3232	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
3233	if (sc->sram_size > rman_get_size(sc->mem_res)) {
3234		device_printf(dev, "impossible memory region size %ld\n",
3235			      rman_get_size(sc->mem_res));
3236		err = ENXIO;
3237		goto abort_with_mem_res;
3238	}
3239
3240	/* make NULL terminated copy of the EEPROM strings section of
3241	   lanai SRAM */
3242	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
3243	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
3244				rman_get_bushandle(sc->mem_res),
3245				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
3246				sc->eeprom_strings,
3247				MXGE_EEPROM_STRINGS_SIZE - 2);
3248	err = mxge_parse_strings(sc);
3249	if (err != 0)
3250		goto abort_with_mem_res;
3251
3252	/* Enable write combining for efficient use of PCIe bus */
3253	mxge_enable_wc(sc);
3254
3255	/* Allocate the out of band dma memory */
3256	err = mxge_dma_alloc(sc, &sc->cmd_dma,
3257			     sizeof (mxge_cmd_t), 64);
3258	if (err != 0)
3259		goto abort_with_mem_res;
3260	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
3261	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
3262	if (err != 0)
3263		goto abort_with_cmd_dma;
3264
3265	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
3266			     sizeof (*sc->fw_stats), 64);
3267	if (err != 0)
3268		goto abort_with_zeropad_dma;
3269	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
3270
3271	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
3272	if (err != 0)
3273		goto abort_with_fw_stats;
3274
3275	/* Add our ithread  */
3276	count = pci_msi_count(dev);
3277	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
3278		rid = 1;
3279		sc->msi_enabled = 1;
3280	} else {
3281		rid = 0;
3282	}
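	/* by FreeBSD convention, SYS_RES_IRQ rid 1 is the first MSI
	   message and rid 0 is the legacy INTx line */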
3283	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3284					 1, RF_SHAREABLE | RF_ACTIVE);
3285	if (sc->irq_res == NULL) {
3286		device_printf(dev, "could not alloc interrupt\n");
3287		goto abort_with_dmabench;
3288	}
3289	if (mxge_verbose)
3290		device_printf(dev, "using %s irq %ld\n",
3291			      sc->msi_enabled ? "MSI" : "INTx",
3292			      rman_get_start(sc->irq_res));
3293	/* select & load the firmware */
3294	err = mxge_select_firmware(sc);
3295	if (err != 0)
3296		goto abort_with_irq_res;
3297	sc->intr_coal_delay = mxge_intr_coal_delay;
3298	err = mxge_reset(sc, 0);
3299	if (err != 0)
3300		goto abort_with_irq_res;
3301
3302	err = mxge_alloc_rings(sc);
3303	if (err != 0) {
3304		device_printf(sc->dev, "failed to allocate rings\n");
3305		goto abort_with_irq_res;
3306	}
3307
3308	err = bus_setup_intr(sc->dev, sc->irq_res,
3309			     INTR_TYPE_NET | INTR_MPSAFE,
3310			     NULL, mxge_intr, sc, &sc->ih);
3311	if (err != 0) {
3312		goto abort_with_rings;
3313	}
3314	/* hook into the network stack */
3315	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3316	ifp->if_baudrate = 100000000;
3317	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3318		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
3319
3320	sc->max_mtu = mxge_max_mtu(sc);
3321	if (sc->max_mtu >= 9000)
3322		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
3323	else
3324		device_printf(dev, "MTU limited to %d.  Install "
3325			      "latest firmware for 9000 byte jumbo support\n",
3326			      sc->max_mtu - ETHER_HDR_LEN);
3327	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3328	ifp->if_capenable = ifp->if_capabilities;
3329	sc->csum_flag = 1;
3330	ifp->if_init = mxge_init;
3331	ifp->if_softc = sc;
3332	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3333	ifp->if_ioctl = mxge_ioctl;
3334	ifp->if_start = mxge_start;
3335	ether_ifattach(ifp, sc->mac_addr);
3336	/* ether_ifattach sets mtu to 1500 */
3337	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
3338		ifp->if_mtu = 9000;
3339
3340	/* Initialise the ifmedia structure */
3341	ifmedia_init(&sc->media, 0, mxge_media_change,
3342		     mxge_media_status);
3343	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3344	mxge_add_sysctls(sc);
3345	return 0;
3346
3347abort_with_rings:
3348	mxge_free_rings(sc);
3349abort_with_irq_res:
3350	bus_release_resource(dev, SYS_RES_IRQ,
3351			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3352	if (sc->msi_enabled)
3353		pci_release_msi(dev);
3354abort_with_dmabench:
3355	mxge_dma_free(&sc->dmabench_dma);
3356abort_with_fw_stats:
3357	mxge_dma_free(&sc->fw_stats_dma);
3358abort_with_zeropad_dma:
3359	mxge_dma_free(&sc->zeropad_dma);
3360abort_with_cmd_dma:
3361	mxge_dma_free(&sc->cmd_dma);
3362abort_with_mem_res:
3363	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3364abort_with_lock:
3365	pci_disable_busmaster(dev);
3366	mtx_destroy(&sc->cmd_mtx);
3367	mtx_destroy(&sc->tx_mtx);
3368	mtx_destroy(&sc->driver_mtx);
3369	if_free(ifp);
3370abort_with_parent_dmat:
3371	bus_dma_tag_destroy(sc->parent_dmat);
3372
3373abort_with_nothing:
3374	return err;
3375}
3376
3377static int
3378mxge_detach(device_t dev)
3379{
3380	mxge_softc_t *sc = device_get_softc(dev);
3381
3382	if (sc->ifp->if_vlantrunk != NULL) {
3383		device_printf(sc->dev,
3384			      "Detach vlans before removing module\n");
3385		return EBUSY;
3386	}
3387	mtx_lock(&sc->driver_mtx);
3388	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3389		mxge_close(sc);
3390	callout_stop(&sc->co_hdl);
3391	mtx_unlock(&sc->driver_mtx);
3392	ether_ifdetach(sc->ifp);
3393	ifmedia_removeall(&sc->media);
3394	mxge_dummy_rdma(sc, 0);
3395	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
3396	mxge_free_rings(sc);
3397	bus_release_resource(dev, SYS_RES_IRQ,
3398			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3399	if (sc->msi_enabled)
3400		pci_release_msi(dev);
3401
3402	sc->rx_done.entry = NULL;
3403	mxge_dma_free(&sc->rx_done.dma);
3404	mxge_dma_free(&sc->fw_stats_dma);
3405	mxge_dma_free(&sc->dmabench_dma);
3406	mxge_dma_free(&sc->zeropad_dma);
3407	mxge_dma_free(&sc->cmd_dma);
3408	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3409	pci_disable_busmaster(dev);
3410	mtx_destroy(&sc->cmd_mtx);
3411	mtx_destroy(&sc->tx_mtx);
3412	mtx_destroy(&sc->driver_mtx);
3413	if_free(sc->ifp);
3414	bus_dma_tag_destroy(sc->parent_dmat);
3415	return 0;
3416}
3417
3418static int
3419mxge_shutdown(device_t dev)
3420{
3421	return 0;
3422}
3423
3424/*
3425  This file uses Myri10GE driver indentation.
3426
3427  Local Variables:
3428  c-file-style:"linux"
3429  tab-width:8
3430  End:
3431*/
3432