if_mxge.c revision 163467
/******************************************************************************

Copyright (c) 2006, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.

 3. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: head/sys/dev/mxge/if_mxge.c 163467 2006-10-17 14:39:19Z gallatin $");
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/linker.h>
40#include <sys/firmware.h>
41#include <sys/endian.h>
42#include <sys/sockio.h>
43#include <sys/mbuf.h>
44#include <sys/malloc.h>
45#include <sys/kdb.h>
46#include <sys/kernel.h>
47#include <sys/module.h>
48#include <sys/memrange.h>
49#include <sys/socket.h>
50#include <sys/sysctl.h>
51#include <sys/sx.h>
52
53#include <net/if.h>
54#include <net/if_arp.h>
55#include <net/ethernet.h>
56#include <net/if_dl.h>
57#include <net/if_media.h>
58
59#include <net/bpf.h>
60
61#include <net/if_types.h>
62#include <net/if_vlan_var.h>
63#include <net/zlib.h>
64
65#include <netinet/in_systm.h>
66#include <netinet/in.h>
67#include <netinet/ip.h>
68#include <netinet/tcp.h>
69
70#include <machine/bus.h>
71#include <machine/resource.h>
72#include <sys/bus.h>
73#include <sys/rman.h>
74
75#include <dev/pci/pcireg.h>
76#include <dev/pci/pcivar.h>
77
78#include <vm/vm.h>		/* for pmap_mapdev() */
79#include <vm/pmap.h>
80
81#include <dev/mxge/mxge_mcp.h>
82#include <dev/mxge/mcp_gen_header.h>
83#include <dev/mxge/if_mxge_var.h>
84
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on Nvidia bridges */
static int mxge_max_intr_slots = 1024;	/* entries in the rx_done interrupt queue */
static int mxge_intr_coal_delay = 30;	/* initial interrupt coalescing delay (usecs) */
static int mxge_deassert_wait = 1;	/* presumably: wait for IRQ deassert; consumed outside this chunk -- TODO confirm */
static int mxge_flow_control = 1;	/* presumably: default pause setting; consumed outside this chunk -- TODO confirm */
static int mxge_verbose = 0;		/* extra diagnostic printfs when non-zero */
/* firmware image names; the aligned image requires 8-byte-aligned PCIe
   completions (see mxge_select_firmware) */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";

/* newbus device method forward declarations */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),		/* newbus allocates the softc for us */
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware images are loaded via firmware(9) */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
123
124static int
125mxge_probe(device_t dev)
126{
127  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
128      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
129	  device_set_desc(dev, "Myri10G-PCIE-8A");
130	  return 0;
131  }
132  return ENXIO;
133}
134
135static void
136mxge_enable_wc(mxge_softc_t *sc)
137{
138	struct mem_range_desc mrdesc;
139	vm_paddr_t pa;
140	vm_offset_t len;
141	int err, action;
142
143	pa = rman_get_start(sc->mem_res);
144	len = rman_get_size(sc->mem_res);
145	mrdesc.mr_base = pa;
146	mrdesc.mr_len = len;
147	mrdesc.mr_flags = MDF_WRITECOMBINE;
148	action = MEMRANGE_SET_UPDATE;
149	strcpy((char *)&mrdesc.mr_owner, "mxge");
150	err = mem_range_attr_set(&mrdesc, &action);
151	if (err != 0) {
152		device_printf(sc->dev,
153			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
154			      (unsigned long)pa, (unsigned long)len, err);
155	} else {
156		sc->wc = 1;
157	}
158}
159
160
161/* callback to get our DMA address */
162static void
163mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
164			 int error)
165{
166	if (error == 0) {
167		*(bus_addr_t *) arg = segs->ds_addr;
168	}
169}
170
/*
 * Allocate a coherent DMA buffer of 'bytes' bytes with the given
 * alignment: creates a single-segment tag, allocates zeroed memory,
 * and loads the map so dma->bus_addr holds the device-visible address.
 * On failure, everything acquired so far is released (goto-cleanup).
 * Returns 0 or a bus_dma errno.
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; the callback stores the bus address */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
221
222
/*
 * Tear down a buffer created by mxge_dma_alloc():
 * unload the map, free the memory, then destroy the tag.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
230
/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

/*
 * Walk the NUL-separated eeprom strings extracting the MAC address,
 * product code and serial number into the softc.  Returns 0 if a MAC
 * was found, ENXIO otherwise.
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
/* advance ptr just past the current NUL-terminated string,
   never walking past limit */
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* the +1 here plus the first +3 in the loop below
			   nets +4, skipping the "MAC=" prefix.
			   NOTE(review): mac_addr_string therefore points at
			   "AC=..." rather than at the hex digits -- confirm
			   that is what its consumers expect */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;	/* past "xx:" (or "AC=" first) */
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			/* NOTE(review): strncpy does not NUL-terminate on
			   truncation; this relies on the softc buffer
			   having been zeroed -- confirm */
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
280
#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (NForce4-class) bridge
 * by setting bit 0x40 in its extended config register 0x178.  Because
 * FreeBSD (at this vintage) cannot issue extended PCIe config cycles,
 * the bridge's config space is located by its memory-mapped address
 * and mapped directly with pmap_mapdev().  x86/amd64 only.
 * Returns 0 on success, EIO if the mapping or sanity checks fail.
 */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	uint32_t val;
	unsigned long off;
	char *va, *cfgptr;
	uint16_t vendor_id, device_id;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return 0;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* fetch the bridge's bus/slot/function and IDs from its parent bus */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of the bridge's config space in the
	   chipset's 0xe0000000 extended-config window */
	off =  0xe0000000UL
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return EIO;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it by checking that the
	   mapped vendor/device IDs match what the PCI bus reported */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}
	/* set the ECRC-generation enable bit and unmap */
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return 0;
}
#else
/* non-x86 stub: the Nvidia bridge cannot exist here */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return ENXIO;
}
#endif
380/*
381 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
382 * when the PCI-E Completion packets are aligned on an 8-byte
383 * boundary.  Some PCI-E chip sets always align Completion packets; on
384 * the ones that do not, the alignment can be enforced by enabling
385 * ECRC generation (if supported).
386 *
387 * When PCI-E Completion packets are not aligned, it is actually more
388 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
389 *
390 * If the driver can neither enable ECRC nor verify that it has
391 * already been enabled, then it must use a firmware image which works
392 * around unaligned completion packets (ethp_z8e.dat), and it should
393 * also ensure that it never gives the device a Read-DMA which is
394 * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
395 * enabled, then the driver should use the aligned (eth_z8e.dat)
396 * firmware image, and set tx.boundary to 4KB.
397 */
398
399static void
400mxge_select_firmware(mxge_softc_t *sc)
401{
402	int err, aligned = 0;
403	device_t pdev;
404	uint16_t pvend, pdid;
405
406	pdev = device_get_parent(device_get_parent(sc->dev));
407	if (pdev == NULL) {
408		device_printf(sc->dev, "could not find parent?\n");
409		goto abort;
410	}
411	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
412	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);
413
414	/* see if we can enable ECRC's on an upstream
415	   Nvidia bridge */
416	if (mxge_nvidia_ecrc_enable &&
417	    (pvend == 0x10de && pdid == 0x005d)) {
418		err = mxge_enable_nvidia_ecrc(sc, pdev);
419		if (err == 0) {
420			aligned = 1;
421			if (mxge_verbose)
422				device_printf(sc->dev,
423					      "Assuming aligned completions"
424					      " (ECRC)\n");
425		}
426	}
427	/* see if the upstream bridge is known to
428	   provided aligned completions */
429	if (/* HT2000 */ (pvend == 0x1166 && pdid == 0x0132) ||
430	    /* PLX */    (pvend == 0x10b5 && pdid == 0x8532) ||
431	    /* Intel */   (pvend == 0x8086 &&
432			   /* E5000 */(pdid >= 0x25f7 && pdid <= 0x25fa))) {
433		if (mxge_verbose)
434			device_printf(sc->dev,
435				      "Assuming aligned completions "
436				      "(0x%x:0x%x)\n", pvend, pdid);
437	}
438
439abort:
440	if (aligned) {
441		sc->fw_name = mxge_fw_aligned;
442		sc->tx.boundary = 4096;
443	} else {
444		sc->fw_name = mxge_fw_unaligned;
445		sc->tx.boundary = 2048;
446	}
447}
448
/*
 * Union used to strip the const qualifier from the firmware image
 * pointer (fw->data) so it can be handed to mxge_pio_copy(), which
 * takes a non-const source pointer.
 */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
454
455static int
456mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
457{
458	int major, minor;
459
460	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
461		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
462			      be32toh(hdr->mcp_type));
463		return EIO;
464	}
465
466	/* save firmware version for sysctl */
467	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
468	if (mxge_verbose)
469		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
470
471	sscanf(sc->fw_version, "%d.%d", &major, &minor);
472
473	if (!(major == MXGEFW_VERSION_MAJOR
474	      && minor == MXGEFW_VERSION_MINOR)) {
475		device_printf(sc->dev, "Found firmware version %s\n",
476			      sc->fw_version);
477		device_printf(sc->dev, "Driver needs %d.%d\n",
478			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
479		return EINVAL;
480	}
481	return 0;
482
483}
484
/*
 * Fetch the selected firmware image via firmware(9), validate its
 * embedded header, and copy it into NIC SRAM at MXGE_FW_OFFSET in
 * 256-byte PIO bursts.  On entry *limit is the maximum acceptable
 * image size (SRAM size); on success it is updated to the actual
 * image size.  Returns 0 or an errno.
 */
static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	/* image must fit in SRAM and be big enough to hold the
	   header-pointer word */
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id: the header offset is stored big-endian at
	   MCP_HEADER_PTR_OFFSET (htobe32 here acts as a 32-bit byte
	   swap, equivalent to be32toh) */
	fw_data = (const char *)fw->data;
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	/* qualhack casts away const for mxge_pio_copy's benefit */
	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		/* read back one byte to flush the posted PIO writes */
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
545
/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align the request buffer to an 8-byte boundary inside buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll (up to ~21ms total) for the firmware's -1 acknowledgement */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
602
603static int
604mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
605{
606	mcp_cmd_t *buf;
607	char buf_bytes[sizeof(*buf) + 8];
608	volatile mcp_cmd_response_t *response = sc->cmd;
609	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
610	uint32_t dma_low, dma_high;
611	int sleep_total = 0;
612
613	/* ensure buf is aligned to 8 bytes */
614	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
615
616	buf->data0 = htobe32(data->data0);
617	buf->data1 = htobe32(data->data1);
618	buf->data2 = htobe32(data->data2);
619	buf->cmd = htobe32(cmd);
620	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
621	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
622
623	buf->response_addr.low = htobe32(dma_low);
624	buf->response_addr.high = htobe32(dma_high);
625	mtx_lock(&sc->cmd_lock);
626	response->result = 0xffffffff;
627	mb();
628	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
629
630	/* wait up to 20ms */
631	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
632		bus_dmamap_sync(sc->cmd_dma.dmat,
633				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
634		mb();
635		if (response->result != 0xffffffff) {
636			if (response->result == 0) {
637				data->data0 = be32toh(response->data);
638				mtx_unlock(&sc->cmd_lock);
639				return 0;
640			} else {
641				device_printf(sc->dev,
642					      "mxge: command %d "
643					      "failed, result = %d\n",
644					      cmd, be32toh(response->result));
645				mtx_unlock(&sc->cmd_lock);
646				return ENXIO;
647			}
648		}
649		DELAY(1000);
650	}
651	mtx_unlock(&sc->cmd_lock);
652	device_printf(sc->dev, "mxge: command %d timed out"
653		      "result = %d\n",
654		      cmd, be32toh(response->result));
655	return EAGAIN;
656}
657
/*
 * When loading a new image fails, fall back to validating whatever
 * firmware is already running on the NIC: locate its header in SRAM,
 * copy it to host memory, and version-check it.  Returns 0 if the
 * running firmware is acceptable, else an errno.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header; the offset word is stored
	   big-endian (htobe32 here acts as a 32-bit byte swap,
	   equivalent to be32toh) */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);
	return status;
}
691
692
/*
 * Load the selected firmware into NIC SRAM (or adopt the one already
 * running if that fails), then hand off execution to it via the
 * bootstrap MCP and wait for the alive acknowledgement.  Returns 0
 * on success or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align the handoff request to an 8-byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}

	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll (10ms steps, up to ~200ms) for the firmware's -1 ack */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
773
774static int
775mxge_update_mac_address(mxge_softc_t *sc)
776{
777	mxge_cmd_t cmd;
778	uint8_t *addr = sc->mac_addr;
779	int status;
780
781
782	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
783		     | (addr[2] << 8) | addr[3]);
784
785	cmd.data1 = ((addr[4] << 8) | (addr[5]));
786
787	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
788	return status;
789}
790
791static int
792mxge_change_pause(mxge_softc_t *sc, int pause)
793{
794	mxge_cmd_t cmd;
795	int status;
796
797	if (pause)
798		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
799				       &cmd);
800	else
801		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
802				       &cmd);
803
804	if (status) {
805		device_printf(sc->dev, "Failed to set flow control mode\n");
806		return ENXIO;
807	}
808	sc->pause = pause;
809	return 0;
810}
811
812static void
813mxge_change_promisc(mxge_softc_t *sc, int promisc)
814{
815	mxge_cmd_t cmd;
816	int status;
817
818	if (promisc)
819		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
820				       &cmd);
821	else
822		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
823				       &cmd);
824
825	if (status) {
826		device_printf(sc->dev, "Failed to set promisc mode\n");
827	}
828}
829
/*
 * Push the interface's multicast filter list to the firmware.
 * Strategy: open the filter (ALLMULTI) first so no frames are lost
 * while the list is rebuilt, flush all old filters, join each group
 * on the interface's list, then close the filter again.  Any failure
 * leaves filtering open (ALLMULTI) rather than dropping traffic.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}


	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte link-level address into the two
		   command words (4 bytes + 2 bytes) */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
895
896
897static int
898mxge_reset(mxge_softc_t *sc)
899{
900
901	mxge_cmd_t cmd;
902	mxge_dma_t dmabench_dma;
903	size_t bytes;
904	int status;
905
906	/* try to send a reset command to the card to see if it
907	   is alive */
908	memset(&cmd, 0, sizeof (cmd));
909	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
910	if (status != 0) {
911		device_printf(sc->dev, "failed reset\n");
912		return ENXIO;
913	}
914
915	mxge_dummy_rdma(sc, 1);
916
917	/* Now exchange information about interrupts  */
918	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);\
919	memset(sc->rx_done.entry, 0, bytes);
920	cmd.data0 = (uint32_t)bytes;
921	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
922	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
923	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
924	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
925
926	status |= mxge_send_cmd(sc,
927				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
928
929
930	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
931
932	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
933	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
934
935
936	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
937				&cmd);
938	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
939	if (status != 0) {
940		device_printf(sc->dev, "failed set interrupt parameters\n");
941		return status;
942	}
943
944
945	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
946
947
948	/* run a DMA benchmark */
949	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
950	status = mxge_dma_alloc(sc, &dmabench_dma, 4096, 4096);
951	if (status)
952		goto dmabench_fail;
953
954	/* Read DMA */
955	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
956	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
957	cmd.data2 = sc->tx.boundary * 0x10000;
958
959	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
960	if (status != 0)
961		device_printf(sc->dev, "read dma benchmark failed\n");
962	else
963		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
964			(cmd.data0 & 0xffff);
965
966	/* Write DMA */
967	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
968	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
969	cmd.data2 = sc->tx.boundary * 0x1;
970	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
971	if (status != 0)
972		device_printf(sc->dev, "write dma benchmark failed\n");
973	else
974		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
975			(cmd.data0 & 0xffff);
976	/* Read/Write DMA */
977	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
978	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
979	cmd.data2 = sc->tx.boundary * 0x10001;
980	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
981	if (status != 0)
982		device_printf(sc->dev, "read/write dma benchmark failed\n");
983	else
984		sc->read_write_dma =
985			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
986			(cmd.data0 & 0xffff);
987
988	mxge_dma_free(&dmabench_dma);
989
990dmabench_fail:
991	/* reset mcp/driver shared state back to 0 */
992	bzero(sc->rx_done.entry, bytes);
993	sc->rx_done.idx = 0;
994	sc->rx_done.cnt = 0;
995	sc->tx.req = 0;
996	sc->tx.done = 0;
997	sc->tx.pkt_done = 0;
998	sc->rx_big.cnt = 0;
999	sc->rx_small.cnt = 0;
1000	sc->rdma_tags_available = 15;
1001	status = mxge_update_mac_address(sc);
1002	mxge_change_promisc(sc, 0);
1003	mxge_change_pause(sc, sc->pause);
1004	mxge_set_multicast_list(sc);
1005	return status;
1006}
1007
1008static int
1009mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1010{
1011        mxge_softc_t *sc;
1012        unsigned int intr_coal_delay;
1013        int err;
1014
1015        sc = arg1;
1016        intr_coal_delay = sc->intr_coal_delay;
1017        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1018        if (err != 0) {
1019                return err;
1020        }
1021        if (intr_coal_delay == sc->intr_coal_delay)
1022                return 0;
1023
1024        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1025                return EINVAL;
1026
1027	sx_xlock(&sc->driver_lock);
1028	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1029	sc->intr_coal_delay = intr_coal_delay;
1030
1031	sx_xunlock(&sc->driver_lock);
1032        return err;
1033}
1034
1035static int
1036mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1037{
1038        mxge_softc_t *sc;
1039        unsigned int enabled;
1040        int err;
1041
1042        sc = arg1;
1043        enabled = sc->pause;
1044        err = sysctl_handle_int(oidp, &enabled, arg2, req);
1045        if (err != 0) {
1046                return err;
1047        }
1048        if (enabled == sc->pause)
1049                return 0;
1050
1051	sx_xlock(&sc->driver_lock);
1052	err = mxge_change_pause(sc, enabled);
1053	sx_xunlock(&sc->driver_lock);
1054        return err;
1055}
1056
1057static int
1058mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1059{
1060        int err;
1061
1062        if (arg1 == NULL)
1063                return EFAULT;
1064        arg2 = be32toh(*(int *)arg1);
1065        arg1 = NULL;
1066        err = sysctl_handle_int(oidp, arg1, arg2, req);
1067
1068        return err;
1069}
1070
1071static void
1072mxge_add_sysctls(mxge_softc_t *sc)
1073{
1074	struct sysctl_ctx_list *ctx;
1075	struct sysctl_oid_list *children;
1076	mcp_irq_data_t *fw;
1077
1078	ctx = device_get_sysctl_ctx(sc->dev);
1079	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1080	fw = sc->fw_stats;
1081
1082	/* random information */
1083	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1084		       "firmware_version",
1085		       CTLFLAG_RD, &sc->fw_version,
1086		       0, "firmware version");
1087	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1088		       "serial_number",
1089		       CTLFLAG_RD, &sc->serial_number_string,
1090		       0, "serial number");
1091	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1092		       "product_code",
1093		       CTLFLAG_RD, &sc->product_code_string,
1094		       0, "product_code");
1095	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1096		       "tx_boundary",
1097		       CTLFLAG_RD, &sc->tx.boundary,
1098		       0, "tx_boundary");
1099	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1100		       "write_combine",
1101		       CTLFLAG_RD, &sc->wc,
1102		       0, "write combining PIO?");
1103	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1104		       "read_dma_MBs",
1105		       CTLFLAG_RD, &sc->read_dma,
1106		       0, "DMA Read speed in MB/s");
1107	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1108		       "write_dma_MBs",
1109		       CTLFLAG_RD, &sc->write_dma,
1110		       0, "DMA Write speed in MB/s");
1111	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1112		       "read_write_dma_MBs",
1113		       CTLFLAG_RD, &sc->read_write_dma,
1114		       0, "DMA concurrent Read/Write speed in MB/s");
1115
1116
1117	/* performance related tunables */
1118	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1119			"intr_coal_delay",
1120			CTLTYPE_INT|CTLFLAG_RW, sc,
1121			0, mxge_change_intr_coal,
1122			"I", "interrupt coalescing delay in usecs");
1123
1124	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1125			"flow_control_enabled",
1126			CTLTYPE_INT|CTLFLAG_RW, sc,
1127			0, mxge_change_flow_control,
1128			"I", "interrupt coalescing delay in usecs");
1129
1130	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1131		       "deassert_wait",
1132		       CTLFLAG_RW, &mxge_deassert_wait,
1133		       0, "Wait for IRQ line to go low in ihandler");
1134
1135	/* stats block from firmware is in network byte order.
1136	   Need to swap it */
1137	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1138			"link_up",
1139			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1140			0, mxge_handle_be32,
1141			"I", "link up");
1142	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1143			"rdma_tags_available",
1144			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1145			0, mxge_handle_be32,
1146			"I", "rdma_tags_available");
1147	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1148			"dropped_link_overflow",
1149			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1150			0, mxge_handle_be32,
1151			"I", "dropped_link_overflow");
1152	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1153			"dropped_link_error_or_filtered",
1154			CTLTYPE_INT|CTLFLAG_RD,
1155			&fw->dropped_link_error_or_filtered,
1156			0, mxge_handle_be32,
1157			"I", "dropped_link_error_or_filtered");
1158	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1159			"dropped_multicast_filtered",
1160			CTLTYPE_INT|CTLFLAG_RD,
1161			&fw->dropped_multicast_filtered,
1162			0, mxge_handle_be32,
1163			"I", "dropped_multicast_filtered");
1164	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1165			"dropped_runt",
1166			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1167			0, mxge_handle_be32,
1168			"I", "dropped_runt");
1169	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1170			"dropped_overrun",
1171			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1172			0, mxge_handle_be32,
1173			"I", "dropped_overrun");
1174	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1175			"dropped_no_small_buffer",
1176			CTLTYPE_INT|CTLFLAG_RD,
1177			&fw->dropped_no_small_buffer,
1178			0, mxge_handle_be32,
1179			"I", "dropped_no_small_buffer");
1180	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1181			"dropped_no_big_buffer",
1182			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1183			0, mxge_handle_be32,
1184			"I", "dropped_no_big_buffer");
1185
1186	/* host counters exported for debugging */
1187	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1188		       "rx_small_cnt",
1189		       CTLFLAG_RD, &sc->rx_small.cnt,
1190		       0, "rx_small_cnt");
1191	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1192		       "rx_big_cnt",
1193		       CTLFLAG_RD, &sc->rx_big.cnt,
1194		       0, "rx_small_cnt");
1195	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1196		       "tx_req",
1197		       CTLFLAG_RD, &sc->tx.req,
1198		       0, "tx_req");
1199	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1200		       "tx_done",
1201		       CTLFLAG_RD, &sc->tx.done,
1202		       0, "tx_done");
1203	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1204		       "tx_pkt_done",
1205		       CTLFLAG_RD, &sc->tx.pkt_done,
1206		       0, "tx_done");
1207
1208	/* verbose printing? */
1209	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1210		       "verbose",
1211		       CTLFLAG_RW, &mxge_verbose,
1212		       0, "verbose printing");
1213
1214}
1215
1216/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1217   backwards one at a time and handle ring wraps */
1218
1219static inline void
1220mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1221			    mcp_kreq_ether_send_t *src, int cnt)
1222{
1223        int idx, starting_slot;
1224        starting_slot = tx->req;
1225        while (cnt > 1) {
1226                cnt--;
1227                idx = (starting_slot + cnt) & tx->mask;
1228                mxge_pio_copy(&tx->lanai[idx],
1229			      &src[cnt], sizeof(*src));
1230                mb();
1231        }
1232}
1233
1234/*
1235 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1236 * at most 32 bytes at a time, so as to avoid involving the software
1237 * pio handler in the nic.   We re-write the first segment's flags
1238 * to mark them valid only after writing the entire chain
1239 */
1240
static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* Clear the first request's flags so the NIC will not act on
	   the chain until the flags are re-written at the very end. */
	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy forwards, two requests (32 bytes)
		   per burst to avoid the NIC's software PIO handler */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request (or the odd trailing
		   request in the non-wrapping case) */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}
1289
static inline void
mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
    /* Submit a send request chain through the write-combining fifo,
       64 bytes (4 requests) per burst. */
    tx->req += cnt;
    mb();
    while (cnt >= 4) {
	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
	    mb();
	    src += 4;
	    cnt -= 4;
    }
    if (cnt > 0) {
	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
	       needs to be so that we don't overrun it */
	    mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
	    mb();
    }
}
1308
/*
 * Build and submit the send-request chain for a TSO packet.  The
 * busdma segments are chopped into mss-sized pieces with the TSO
 * flag protocol expected by the firmware; rdma_count bookkeeping is
 * filled in retroactively (see the long comment below).  On overflow
 * of the per-packet descriptor budget the mbuf is dropped.
 */
static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;	/* rate-limits the overflow warning to one print */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < sizeof (*eh)
			    + sizeof (*ip))) {
		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
			   sc->scratch);
		eh = (struct ether_header *)sc->scratch;
	} else {
		eh = mtod(m, struct ether_header *);
	}
	ip = (struct ip *) (eh + 1);
	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		eh = (struct ether_header *) sc->scratch;
		ip = (struct ip *) (eh + 1);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	/* start negative: counts up to zero as the header is consumed */
	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seglen = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			cum_len_next = cum_len + seglen;
			/* retroactive fix-up: see the comment above */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* branch-free: -1 if chop or next_is_first,
				   else unchanged; then bump on mid-cut */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;	/* placeholder; fixed up above */
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* final retroactive rdma_count fix-up for the last run */
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards, marking every request of the trailing
	   segment with TSO_LAST */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	sc->ifp->if_oerrors++;
	if (!once) {
		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
		once = 1;
	}
	return;

}
1470
/*
 * Map an outgoing mbuf chain for DMA, build the send-request list
 * (handing TSO packets off to mxge_encap_tso()), pad runts to the
 * 60-byte ethernet minimum from a shared zero page, and submit the
 * requests to the NIC.  On failure the mbuf is freed and counted
 * as an output error.
 */
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;


	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < sizeof (*eh)
				    + sizeof (*ip))) {
			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
				   sc->scratch);
			eh = (struct ether_header *)sc->scratch;
		} else {
			eh = mtod(m, struct ether_header *);
		}
		ip = (struct ip *) (eh + 1);
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* the checksum offset only applies to the first
		   segment that still contains the offset */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		/* extra segment pointing at a pre-allocated page of zeros */
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the last slot so mxge_tx_done() can count whole packets */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1622
1623
1624
1625
1626static inline void
1627mxge_start_locked(mxge_softc_t *sc)
1628{
1629	struct mbuf *m;
1630	struct ifnet *ifp;
1631
1632	ifp = sc->ifp;
1633	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1634	       > MXGE_MAX_SEND_DESC) {
1635
1636		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1637		if (m == NULL) {
1638			return;
1639		}
1640		/* let BPF see it */
1641		BPF_MTAP(ifp, m);
1642
1643		/* give it to the nic */
1644		mxge_encap(sc, m);
1645	}
1646	/* ran out of transmit slots */
1647	sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1648}
1649
1650static void
1651mxge_start(struct ifnet *ifp)
1652{
1653	mxge_softc_t *sc = ifp->if_softc;
1654
1655
1656	mtx_lock(&sc->tx_lock);
1657	mxge_start_locked(sc);
1658	mtx_unlock(&sc->tx_lock);
1659}
1660
1661/*
1662 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1663 * at most 32 bytes at a time, so as to avoid involving the software
1664 * pio handler in the nic.   We re-write the first segment's low
1665 * DMA address to mark it valid only after we write the entire chunk
1666 * in a burst
1667 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* poison the first descriptor's low address so the NIC
	   ignores the batch until the real value is restored */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	mxge_pio_copy(dst, src, 8 * sizeof (*src));
	mb();
	/* restoring the low address marks all 8 descriptors valid */
	dst->addr_low = low;
	mb();
}
1681
1682static int
1683mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1684{
1685	bus_dma_segment_t seg;
1686	struct mbuf *m;
1687	mxge_rx_buf_t *rx = &sc->rx_small;
1688	int cnt, err;
1689
1690	m = m_gethdr(M_DONTWAIT, MT_DATA);
1691	if (m == NULL) {
1692		rx->alloc_fail++;
1693		err = ENOBUFS;
1694		goto done;
1695	}
1696	m->m_len = MHLEN;
1697	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1698				      &seg, &cnt, BUS_DMA_NOWAIT);
1699	if (err != 0) {
1700		m_free(m);
1701		goto done;
1702	}
1703	rx->info[idx].m = m;
1704	rx->shadow[idx].addr_low =
1705		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1706	rx->shadow[idx].addr_high =
1707		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1708
1709done:
1710	if ((idx & 7) == 7) {
1711		if (rx->wc_fifo == NULL)
1712			mxge_submit_8rx(&rx->lanai[idx - 7],
1713					&rx->shadow[idx - 7]);
1714		else {
1715			mb();
1716			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1717		}
1718        }
1719	return err;
1720}
1721
1722static int
1723mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1724{
1725	bus_dma_segment_t seg;
1726	struct mbuf *m;
1727	mxge_rx_buf_t *rx = &sc->rx_big;
1728	int cnt, err;
1729
1730	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
1731	if (m == NULL) {
1732		rx->alloc_fail++;
1733		err = ENOBUFS;
1734		goto done;
1735	}
1736	m->m_len = sc->big_bytes;
1737	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1738				      &seg, &cnt, BUS_DMA_NOWAIT);
1739	if (err != 0) {
1740		m_free(m);
1741		goto done;
1742	}
1743	rx->info[idx].m = m;
1744	rx->shadow[idx].addr_low =
1745		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1746	rx->shadow[idx].addr_high =
1747		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1748
1749done:
1750	if ((idx & 7) == 7) {
1751		if (rx->wc_fifo == NULL)
1752			mxge_submit_8rx(&rx->lanai[idx - 7],
1753					&rx->shadow[idx - 7]);
1754		else {
1755			mb();
1756			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1757		}
1758        }
1759	return err;
1760}
1761
1762static inline void
1763mxge_rx_csum(struct mbuf *m, int csum)
1764{
1765	struct ether_header *eh;
1766	struct ip *ip;
1767
1768	eh = mtod(m, struct ether_header *);
1769	if (__predict_true(eh->ether_type ==  htons(ETHERTYPE_IP))) {
1770		ip = (struct ip *)(eh + 1);
1771		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
1772				   ip->ip_p == IPPROTO_UDP)) {
1773			m->m_pkthdr.csum_data = csum;
1774			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1775		}
1776	}
1777}
1778
/*
 * Receive completion for a frame spanning one or more big-ring
 * buffers: chain the buffers into a single mbuf chain, refill each
 * consumed slot, trim trailing slack, and pass the packet up.  If a
 * refill fails mid-frame, the partial chain is dropped and the
 * remaining slots of the frame are recycled to keep the ring in sync.
 */
static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
{
	struct ifnet *ifp;
	struct mbuf *m = 0; 		/* -Wunitialized */
	struct mbuf *m_prev = 0;	/* -Wunitialized */
	struct mbuf *m_head = 0;
	bus_dmamap_t old_map;
	mxge_rx_buf_t *rx;
	int idx;


	rx = &sc->rx_big;
	ifp = sc->ifp;
	while (len > 0) {
		idx = rx->cnt & rx->mask;
                rx->cnt++;
		/* save a pointer to the received mbuf */
		m = rx->info[idx].m;
		/* try to replace the received mbuf */
		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
			goto drop;
		}
		/* unmap the received buffer */
		old_map = rx->info[idx].map;
		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dmat, old_map);

		/* swap the bus_dmamap_t's */
		rx->info[idx].map = rx->extra_map;
		rx->extra_map = old_map;

		/* chain multiple segments together */
		if (!m_head) {
			m_head = m;
			/* mcp implicitly skips 1st bytes so that
			 * packet is properly aligned */
			m->m_data += MXGEFW_PAD;
			m->m_pkthdr.len = len;
			m->m_len = sc->big_bytes - MXGEFW_PAD;
		} else {
			m->m_len = sc->big_bytes;
			m->m_flags &= ~M_PKTHDR;
			m_prev->m_next = m;
		}
		len -= m->m_len;
		m_prev = m;
	}

	/* trim trailing garbage from the last mbuf in the chain.  If
	 * there is any garbage, len will be negative */
	m->m_len += len;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m_head, csum);

	/* pass the frame up the stack */
	m_head->m_pkthdr.rcvif = ifp;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m_head);
	return;

drop:
	/* drop the frame -- the old mbuf(s) are re-cycled by running
	   every slot through the allocator */
        if (m_head) {
                len -= sc->big_bytes;
                m_freem(m_head);
        } else {
                len -= (sc->big_bytes + MXGEFW_PAD);
        }
        while ((int)len > 0) {
                idx = rx->cnt & rx->mask;
                rx->cnt++;
                m = rx->info[idx].m;
		/* only free the old mbuf if the slot was refilled;
		   otherwise the old one stays in place and is reused */
                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
			m_freem(m);
			/* unmap the received buffer */
			old_map = rx->info[idx].map;
			bus_dmamap_sync(rx->dmat, old_map,
					BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rx->dmat, old_map);

			/* swap the bus_dmamap_t's */
			rx->info[idx].map = rx->extra_map;
			rx->extra_map = old_map;
		}
                len -= sc->big_bytes;
        }

	ifp->if_ierrors++;

}
1873
1874static inline void
1875mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
1876{
1877	struct ifnet *ifp;
1878	struct mbuf *m;
1879	mxge_rx_buf_t *rx;
1880	bus_dmamap_t old_map;
1881	int idx;
1882
1883	ifp = sc->ifp;
1884	rx = &sc->rx_small;
1885	idx = rx->cnt & rx->mask;
1886	rx->cnt++;
1887	/* save a pointer to the received mbuf */
1888	m = rx->info[idx].m;
1889	/* try to replace the received mbuf */
1890	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
1891		/* drop the frame -- the old mbuf is re-cycled */
1892		ifp->if_ierrors++;
1893		return;
1894	}
1895
1896	/* unmap the received buffer */
1897	old_map = rx->info[idx].map;
1898	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1899	bus_dmamap_unload(rx->dmat, old_map);
1900
1901	/* swap the bus_dmamap_t's */
1902	rx->info[idx].map = rx->extra_map;
1903	rx->extra_map = old_map;
1904
1905	/* mcp implicitly skips 1st 2 bytes so that packet is properly
1906	 * aligned */
1907	m->m_data += MXGEFW_PAD;
1908
1909	/* if the checksum is valid, mark it in the mbuf header */
1910	if (sc->csum_flag)
1911		mxge_rx_csum(m, csum);
1912
1913	/* pass the frame up the stack */
1914	m->m_pkthdr.rcvif = ifp;
1915	m->m_len = m->m_pkthdr.len = len;
1916	ifp->if_ipackets++;
1917	(*ifp->if_input)(ifp, m);
1918}
1919
1920static inline void
1921mxge_clean_rx_done(mxge_softc_t *sc)
1922{
1923	mxge_rx_done_t *rx_done = &sc->rx_done;
1924	int limit = 0;
1925	uint16_t length;
1926	uint16_t checksum;
1927
1928
1929	while (rx_done->entry[rx_done->idx].length != 0) {
1930		length = ntohs(rx_done->entry[rx_done->idx].length);
1931		rx_done->entry[rx_done->idx].length = 0;
1932		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
1933		if (length <= (MHLEN - MXGEFW_PAD))
1934			mxge_rx_done_small(sc, length, checksum);
1935		else
1936			mxge_rx_done_big(sc, length, checksum);
1937		rx_done->cnt++;
1938		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
1939
1940		/* limit potential for livelock */
1941		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
1942			break;
1943
1944	}
1945}
1946
1947
1948static inline void
1949mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
1950{
1951	struct ifnet *ifp;
1952	mxge_tx_buf_t *tx;
1953	struct mbuf *m;
1954	bus_dmamap_t map;
1955	int idx, limit;
1956
1957	limit = 0;
1958	tx = &sc->tx;
1959	ifp = sc->ifp;
1960	while (tx->pkt_done != mcp_idx) {
1961		idx = tx->done & tx->mask;
1962		tx->done++;
1963		m = tx->info[idx].m;
1964		/* mbuf and DMA map only attached to the first
1965		   segment per-mbuf */
1966		if (m != NULL) {
1967			ifp->if_opackets++;
1968			tx->info[idx].m = NULL;
1969			map = tx->info[idx].map;
1970			bus_dmamap_unload(tx->dmat, map);
1971			m_freem(m);
1972		}
1973		if (tx->info[idx].flag) {
1974			tx->info[idx].flag = 0;
1975			tx->pkt_done++;
1976		}
1977		/* limit potential for livelock by only handling
1978		   2 full tx rings per call */
1979		if (__predict_false(++limit >  2 * tx->mask))
1980			break;
1981	}
1982
1983	/* If we have space, clear IFF_OACTIVE to tell the stack that
1984           its OK to send packets */
1985
1986	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
1987	    tx->req - tx->done < (tx->mask + 1)/4) {
1988		mtx_lock(&sc->tx_lock);
1989		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
1990		mxge_start_locked(sc);
1991		mtx_unlock(&sc->tx_lock);
1992	}
1993}
1994
/*
 * Interrupt handler.  The firmware DMAs a stats block into host
 * memory; stats->valid going non-zero signals work.  Transmit
 * completions and receive completions are drained until the firmware
 * clears valid, then the interrupt is claimed back to the NIC.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	/* lower legacy IRQ  */
	*sc->irq_deassert = 0;
	mb();
	if (!mxge_deassert_wait)
		/* don't wait for conf. that irq is low */
		stats->valid = 0;
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	/* firmware raised stats_updated: propagate link state changes
	   and RDMA timeout warnings */
	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
2057
/* if_watchdog stub: transmit timeout recovery is not implemented yet */
static void
mxge_watchdog(struct ifnet *ifp)
{
	printf("%s called\n", __FUNCTION__);
}
2063
/* if_init stub: bring-up is driven from attach/ioctl paths instead */
static void
mxge_init(void *arg)
{
}
2068
2069
2070
2071static void
2072mxge_free_mbufs(mxge_softc_t *sc)
2073{
2074	int i;
2075
2076	for (i = 0; i <= sc->rx_big.mask; i++) {
2077		if (sc->rx_big.info[i].m == NULL)
2078			continue;
2079		bus_dmamap_unload(sc->rx_big.dmat,
2080				  sc->rx_big.info[i].map);
2081		m_freem(sc->rx_big.info[i].m);
2082		sc->rx_big.info[i].m = NULL;
2083	}
2084
2085	for (i = 0; i <= sc->rx_big.mask; i++) {
2086		if (sc->rx_big.info[i].m == NULL)
2087			continue;
2088		bus_dmamap_unload(sc->rx_big.dmat,
2089				  sc->rx_big.info[i].map);
2090		m_freem(sc->rx_big.info[i].m);
2091		sc->rx_big.info[i].m = NULL;
2092	}
2093
2094	for (i = 0; i <= sc->tx.mask; i++) {
2095		if (sc->tx.info[i].m == NULL)
2096			continue;
2097		bus_dmamap_unload(sc->tx.dmat,
2098				  sc->tx.info[i].map);
2099		m_freem(sc->tx.info[i].m);
2100		sc->tx.info[i].m = NULL;
2101	}
2102}
2103
/*
 * Release all ring-related resources: request/segment scratch
 * buffers, shadow rings, per-slot info arrays with their DMA maps,
 * the spare "extra" maps, and finally the DMA tags.  Order matters:
 * every map created from a tag must be destroyed before the tag.
 * Safe to call with partially-allocated rings (NULL checks throughout).
 */
static void
mxge_free_rings(mxge_softc_t *sc)
{
	int i;

	if (sc->tx.req_bytes != NULL)
		free(sc->tx.req_bytes, M_DEVBUF);
	if (sc->tx.seg_list != NULL)
		free(sc->tx.seg_list, M_DEVBUF);
	if (sc->rx_small.shadow != NULL)
		free(sc->rx_small.shadow, M_DEVBUF);
	if (sc->rx_big.shadow != NULL)
		free(sc->rx_big.shadow, M_DEVBUF);
	if (sc->tx.info != NULL) {
		for (i = 0; i <= sc->tx.mask; i++) {
			if (sc->tx.info[i].map != NULL)
				bus_dmamap_destroy(sc->tx.dmat,
						   sc->tx.info[i].map);
		}
		free(sc->tx.info, M_DEVBUF);
	}
	if (sc->rx_small.info != NULL) {
		for (i = 0; i <= sc->rx_small.mask; i++) {
			if (sc->rx_small.info[i].map != NULL)
				bus_dmamap_destroy(sc->rx_small.dmat,
						   sc->rx_small.info[i].map);
		}
		free(sc->rx_small.info, M_DEVBUF);
	}
	if (sc->rx_big.info != NULL) {
		for (i = 0; i <= sc->rx_big.mask; i++) {
			if (sc->rx_big.info[i].map != NULL)
				bus_dmamap_destroy(sc->rx_big.dmat,
						   sc->rx_big.info[i].map);
		}
		free(sc->rx_big.info, M_DEVBUF);
	}
	if (sc->rx_big.extra_map != NULL)
		bus_dmamap_destroy(sc->rx_big.dmat,
				   sc->rx_big.extra_map);
	if (sc->rx_small.extra_map != NULL)
		bus_dmamap_destroy(sc->rx_small.dmat,
				   sc->rx_small.extra_map);
	if (sc->tx.dmat != NULL)
		bus_dma_tag_destroy(sc->tx.dmat);
	if (sc->rx_small.dmat != NULL)
		bus_dma_tag_destroy(sc->rx_small.dmat);
	if (sc->rx_big.dmat != NULL)
		bus_dma_tag_destroy(sc->rx_big.dmat);
}
2154
/*
 * Query the firmware for its ring sizes, then allocate all host-side
 * ring state: the tx request copy block and busdma segment list, the
 * rx shadow rings, the per-slot info arrays, one busdma tag per ring,
 * and one dmamap per slot (plus a spare map for each rx ring).
 * Returns 0 on success or an errno; on any failure everything
 * allocated so far is released via mxge_free_rings().
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size, rx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int i, err;
	unsigned long bytes;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine ring sizes\n");
		goto abort_with_nothing;
	}

	rx_ring_size = cmd.data0;

	/* firmware reports sizes in bytes; convert to entry counts */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	IFQ_SET_READY(&sc->ifp->if_snd);

	/* ring entry counts are powers of two, so masks are count - 1 */
	sc->tx.mask = tx_ring_entries - 1;
	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;

	/* default error for the allocation failures below */
	err = ENOMEM;

	/* allocate the tx request copy block */
	bytes = 8 +
		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.req_bytes == NULL)
		goto abort_with_nothing;
	/* ensure req_list entries are aligned to 8 bytes */
	sc->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
	sc->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.seg_list == NULL)
		goto abort_with_alloc;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.shadow == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.shadow == NULL)
		goto abort_with_alloc;

	/* allocate the host info rings */
	bytes = tx_ring_entries * sizeof (*sc->tx.info);
	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->tx.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.info == NULL)
		goto abort_with_alloc;

	/* allocate the busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx.boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC/2,	/* num segs */
				 sc->tx.boundary,	/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* small rx buffers live in mbuf headers, hence MHLEN */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 4096,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* now use these tags to setup dmamaps for each slot
	   in each ring */
	for (i = 0; i <= sc->tx.mask; i++) {
		err = bus_dmamap_create(sc->tx.dmat, 0,
					&sc->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	for (i = 0; i <= sc->rx_small.mask; i++) {
		err = bus_dmamap_create(sc->rx_small.dmat, 0,
					&sc->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			goto abort_with_alloc;
		}
	}
	/* spare map used when replacing an rx buffer fails */
	err = bus_dmamap_create(sc->rx_small.dmat, 0,
				&sc->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
			goto abort_with_alloc;
	}

	for (i = 0; i <= sc->rx_big.mask; i++) {
		err = bus_dmamap_create(sc->rx_big.dmat, 0,
					&sc->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_big.dmat, 0,
				&sc->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
	}
	return 0;

abort_with_alloc:
	mxge_free_rings(sc);

abort_with_nothing:
	return err;
}
2338
/*
 * Bring the interface up: reset the NIC, allocate rings and the
 * interrupt handler, fetch the firmware's ring locations, stock the
 * receive rings, program the MTU/buffer sizes and stats DMA block,
 * and finally issue ETHERNET_UP.  Returns 0 on success or an errno;
 * on failure all state acquired so far is unwound via the goto
 * ladder at the bottom.  Called with the driver sx lock held.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err;
	bus_dmamap_t map;
	bus_addr_t bus;


	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	/* pick the big rx buffer size: a regular cluster if the MTU
	   fits, otherwise a jumbo page-sized cluster */
	if (MCLBYTES >=
	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
		sc->big_bytes = MCLBYTES;
	else
		sc->big_bytes = MJUMPAGESIZE;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		return err;
	}

	err = bus_setup_intr(sc->dev, sc->irq_res,
			     INTR_TYPE_NET | INTR_MPSAFE,
			     mxge_intr, sc, &sc->ih);
	if (err != 0) {
		goto abort_with_rings;
	}

	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		err = EIO;
		goto abort_with_irq;
	}

	/* use the write-combining fifos if WC mapping succeeded at
	   attach time, otherwise fall back to plain sram writes */
	if (sc->wc) {
		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
	} else {
		sc->tx.wc_fifo = 0;
		sc->rx_small.wc_fifo = 0;
		sc->rx_big.wc_fifo = 0;
	}


	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	for (i = 0; i <= sc->rx_big.mask; i++) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = sc->big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	/* older firmware lacks STATS_DMA_V2; fall back to the
	   obsolete command pointing at just the send_done_count */
	if (err != 0) {
		bus = sc->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);
abort_with_irq:
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
abort_with_rings:
	mxge_free_rings(sc);
	return err;
}
2488
2489static int
2490mxge_close(mxge_softc_t *sc)
2491{
2492	mxge_cmd_t cmd;
2493	int err, old_down_cnt;
2494
2495	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2496	old_down_cnt = sc->down_cnt;
2497	mb();
2498	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2499	if (err) {
2500		device_printf(sc->dev, "Couldn't bring down link\n");
2501	}
2502	if (old_down_cnt == sc->down_cnt) {
2503		/* wait for down irq */
2504		(void)tsleep(&sc->down_cnt, PWAIT, "down mxge", hz);
2505	}
2506	if (old_down_cnt == sc->down_cnt) {
2507		device_printf(sc->dev, "never got down irq\n");
2508	}
2509	if (sc->ih != NULL)
2510		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
2511	mxge_free_mbufs(sc);
2512	mxge_free_rings(sc);
2513	return 0;
2514}
2515
2516
2517static int
2518mxge_media_change(struct ifnet *ifp)
2519{
2520	return EINVAL;
2521}
2522
2523static int
2524mxge_change_mtu(mxge_softc_t *sc, int mtu)
2525{
2526	struct ifnet *ifp = sc->ifp;
2527	int real_mtu, old_mtu;
2528	int err = 0;
2529
2530
2531	real_mtu = mtu + ETHER_HDR_LEN;
2532	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
2533	    real_mtu < 60)
2534		return EINVAL;
2535	sx_xlock(&sc->driver_lock);
2536	old_mtu = ifp->if_mtu;
2537	ifp->if_mtu = mtu;
2538	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2539		mxge_close(sc);
2540		err = mxge_open(sc);
2541		if (err != 0) {
2542			ifp->if_mtu = old_mtu;
2543			mxge_close(sc);
2544			(void) mxge_open(sc);
2545		}
2546	}
2547	sx_xunlock(&sc->driver_lock);
2548	return err;
2549}
2550
2551static void
2552mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2553{
2554	mxge_softc_t *sc = ifp->if_softc;
2555
2556
2557	if (sc == NULL)
2558		return;
2559	ifmr->ifm_status = IFM_AVALID;
2560	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2561	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2562	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2563}
2564
2565static int
2566mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2567{
2568	mxge_softc_t *sc = ifp->if_softc;
2569	struct ifreq *ifr = (struct ifreq *)data;
2570	int err, mask;
2571
2572	err = 0;
2573	switch (command) {
2574	case SIOCSIFADDR:
2575	case SIOCGIFADDR:
2576		err = ether_ioctl(ifp, command, data);
2577		break;
2578
2579	case SIOCSIFMTU:
2580		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2581		break;
2582
2583	case SIOCSIFFLAGS:
2584		sx_xlock(&sc->driver_lock);
2585		if (ifp->if_flags & IFF_UP) {
2586			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2587				err = mxge_open(sc);
2588			else {
2589				/* take care of promis can allmulti
2590				   flag chages */
2591				mxge_change_promisc(sc,
2592						    ifp->if_flags & IFF_PROMISC);
2593				mxge_set_multicast_list(sc);
2594			}
2595		} else {
2596			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2597				mxge_close(sc);
2598		}
2599		sx_xunlock(&sc->driver_lock);
2600		break;
2601
2602	case SIOCADDMULTI:
2603	case SIOCDELMULTI:
2604		sx_xlock(&sc->driver_lock);
2605		mxge_set_multicast_list(sc);
2606		sx_xunlock(&sc->driver_lock);
2607		break;
2608
2609	case SIOCSIFCAP:
2610		sx_xlock(&sc->driver_lock);
2611		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2612		if (mask & IFCAP_TXCSUM) {
2613			if (IFCAP_TXCSUM & ifp->if_capenable) {
2614				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
2615				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
2616						      | CSUM_TSO);
2617			} else {
2618				ifp->if_capenable |= IFCAP_TXCSUM;
2619				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2620			}
2621		} else if (mask & IFCAP_RXCSUM) {
2622			if (IFCAP_RXCSUM & ifp->if_capenable) {
2623				ifp->if_capenable &= ~IFCAP_RXCSUM;
2624				sc->csum_flag = 0;
2625			} else {
2626				ifp->if_capenable |= IFCAP_RXCSUM;
2627				sc->csum_flag = 1;
2628			}
2629		}
2630		if (mask & IFCAP_TSO4) {
2631			if (IFCAP_TSO4 & ifp->if_capenable) {
2632				ifp->if_capenable &= ~IFCAP_TSO4;
2633				ifp->if_hwassist &= ~CSUM_TSO;
2634			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
2635				ifp->if_capenable |= IFCAP_TSO4;
2636				ifp->if_hwassist |= CSUM_TSO;
2637			} else {
2638				printf("mxge requires tx checksum offload"
2639				       " be enabled to use TSO\n");
2640				err = EINVAL;
2641			}
2642		}
2643		sx_xunlock(&sc->driver_lock);
2644		break;
2645
2646	case SIOCGIFMEDIA:
2647		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2648				    &sc->media, command);
2649                break;
2650
2651	default:
2652		err = ENOTTY;
2653        }
2654	return err;
2655}
2656
2657static void
2658mxge_fetch_tunables(mxge_softc_t *sc)
2659{
2660
2661	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2662			  &mxge_flow_control);
2663	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2664			  &mxge_intr_coal_delay);
2665	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2666			  &mxge_nvidia_ecrc_enable);
2667	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2668			  &mxge_deassert_wait);
2669	TUNABLE_INT_FETCH("hw.mxge.verbose",
2670			  &mxge_verbose);
2671
2672	if (bootverbose)
2673		mxge_verbose = 1;
2674	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2675		mxge_intr_coal_delay = 30;
2676	sc->pause = mxge_flow_control;
2677}
2678
/*
 * Device attach: create the parent busdma tag, allocate the ifnet
 * and locks, map the board's SRAM BAR, parse the EEPROM strings,
 * allocate the out-of-band DMA blocks and interrupt queue, load and
 * reset the firmware, then hook the interface into the network
 * stack.  Returns 0 on success or an errno; partial state is
 * unwound through the goto ladder at the bottom (in reverse order
 * of acquisition).
 */
static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	size_t bytes;
	int rid, err;
	uint16_t cmd;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	err = bus_dma_tag_create(NULL,			/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 4096,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_nothing;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	/* NOTE(review): cmd_lock is given a NULL name while tx_lock
	   uses the device nameunit -- presumably an oversight; verify */
	mtx_init(&sc->cmd_lock, NULL,
		 MTX_NETWORK_LOCK, MTX_DEF);
	mtx_init(&sc->tx_lock, device_get_nameunit(dev),
		 MTX_NETWORK_LOCK, MTX_DEF);
	sx_init(&sc->driver_lock, device_get_nameunit(dev));

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	/* usable SRAM is 2MB minus the firmware/scratch regions */
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
			     sizeof (*sc->fw_stats), 64);
	if (err != 0)
		goto abort_with_zeropad_dma;
	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;


	/* allocate interrupt queues */
	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
	if (err != 0)
		goto abort_with_fw_stats;
	sc->rx_done.entry = sc->rx_done.dma.addr;
	bzero(sc->rx_done.entry, bytes);
	/* Add our ithread  */
	rid = 0;
	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
					 1, RF_SHAREABLE | RF_ACTIVE);
	if (sc->irq_res == NULL) {
		device_printf(dev, "could not alloc interrupt\n");
		goto abort_with_rx_done;
	}

	/* load the firmware */
	mxge_select_firmware(sc);

	err = mxge_load_firmware(sc);
	if (err != 0)
		goto abort_with_irq_res;
	sc->intr_coal_delay = mxge_intr_coal_delay;
	err = mxge_reset(sc);
	if (err != 0)
		goto abort_with_irq_res;

	/* hook into the network stack */
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifp->if_baudrate = 100000000;
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4;
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_capenable = ifp->if_capabilities;
	sc->csum_flag = 1;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	ifp->if_watchdog = mxge_watchdog;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to 1500 */
	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;

	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
	mxge_add_sysctls(sc);
	return 0;

	/* error unwinding, in reverse order of acquisition */
abort_with_irq_res:
	bus_release_resource(dev, SYS_RES_IRQ, 0, sc->irq_res);
abort_with_rx_done:
	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
abort_with_fw_stats:
	mxge_dma_free(&sc->fw_stats_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_lock);
	mtx_destroy(&sc->tx_lock);
	sx_destroy(&sc->driver_lock);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);

abort_with_nothing:
	return err;
}
2854
/*
 * Device detach: close the interface if it is running, detach it
 * from the network stack, quiesce the firmware's dummy RDMA engine,
 * then release all resources in the reverse of the order mxge_attach
 * acquired them.  Always returns 0.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	sx_xlock(&sc->driver_lock);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	sx_xunlock(&sc->driver_lock);
	ether_ifdetach(sc->ifp);
	/* tell the firmware to stop its dummy RDMA traffic */
	mxge_dummy_rdma(sc, 0);
	bus_release_resource(dev, SYS_RES_IRQ, 0, sc->irq_res);
	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_lock);
	mtx_destroy(&sc->tx_lock);
	sx_destroy(&sc->driver_lock);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
2881
2882static int
2883mxge_shutdown(device_t dev)
2884{
2885	return 0;
2886}
2887
2888/*
2889  This file uses Myri10GE driver indentation.
2890
2891  Local Variables:
2892  c-file-style:"linux"
2893  tab-width:8
2894  End:
2895*/
2896