/******************************************************************************

Copyright (c) 2006-2013, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

$FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $

***************************************************************************/

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/in_cksum.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/serialize.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ifq_var.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/vlan/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <sys/bus.h>
#include <sys/rman.h>

#include <bus/pci/pcireg.h>
#include <bus/pci/pcivar.h>
#include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386__) || defined(__x86_64__)
#include <machine/specialreg.h>
#endif

#include <dev/netif/mxge/mxge_mcp.h>
#include <dev/netif/mxge/mcp_gen_header.h>
#include <dev/netif/mxge/if_mxge_var.h>

#define MXGE_RX_SMALL_BUFLEN		(MHLEN - MXGEFW_PAD)

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_always_promisc = 0;
static int mxge_throttle = 0;
static int mxge_msi_enable = 1;

static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
static const char *mxge_fw_aligned = "mxge_eth_z8e";
static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

TUNABLE_INT("hw.mxge.max_slices", &mxge_max_slices);
TUNABLE_INT("hw.mxge.flow_control_enabled", &mxge_flow_control);
TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);

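/*
 * These tunables are read once at boot.  As a minimal sketch (names
 * taken from the TUNABLE_INT() declarations above, values are only
 * examples), they could be set from /boot/loader.conf:
 *
 *	hw.mxge.max_slices="2"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="0"
 */
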
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);

static device_method_t mxge_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, mxge_probe),
	DEVMETHOD(device_attach, mxge_attach),
	DEVMETHOD(device_detach, mxge_detach),
	DEVMETHOD(device_shutdown, mxge_shutdown),
	DEVMETHOD_END
};

static driver_t mxge_driver = {
	"mxge",
	mxge_methods,
	sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static void mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
static void mxge_watchdog_reset(mxge_softc_t *sc);
static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);

static int
mxge_probe(device_t dev)
{
	if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
	    (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
	     pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
		int rev = pci_get_revid(dev);

		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n", rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386__) || defined(__x86_64__)
	vm_offset_t len;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
	    PAT_WRITE_COMBINING);
#endif
}

static int
mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
    bus_size_t alignment)
{
	bus_size_t boundary;
	int err;

	if (bytes > 4096 && alignment == 4096)
		boundary = 0;
	else
		boundary = 4096;

	err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
		return err;
	}
	return 0;
}

static void
mxge_dma_free(bus_dmamem_t *dma)
{
	bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
	bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
	bus_dma_tag_destroy(dma->dmem_tag);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
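/*
 * For example (values invented for illustration), a raw string block of
 *	"MAC=00:60:dd:43:a0:12\0SN=123456\0PC=10G-PCIE-8A-C\0\0"
 * parses into mac_addr 00:60:dd:43:a0:12, serial_number_string
 * "123456" and product_code_string "10G-PCIE-8A-C".
 */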
static int
mxge_parse_strings(mxge_softc_t *sc)
{
	const char *ptr;
	int i, found_mac, found_sn2;
	char *endptr;

	ptr = sc->eeprom_strings;
	found_mac = 0;
	found_sn2 = 0;
	while (*ptr != '\0') {
		if (strncmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			for (i = 0;;) {
				sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
				if (endptr - ptr != 2)
					goto abort;
				ptr = endptr;
				if (++i == 6)
					break;
				if (*ptr++ != ':')
					goto abort;
			}
			found_mac = 1;
		} else if (strncmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strlcpy(sc->product_code_string, ptr,
			    sizeof(sc->product_code_string));
		} else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
			ptr += 3;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		} else if (strncmp(ptr, "SN2=", 4) == 0) {
			/* SN2 takes precedence over SN */
			ptr += 4;
			found_sn2 = 1;
			strlcpy(sc->serial_number_string, ptr,
			    sizeof(sc->serial_number_string));
		}
		while (*ptr++ != '\0') {}
	}

	if (found_mac)
		return 0;

abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");
	return ENXIO;
}

#if defined(__i386__) || defined(__x86_64__)

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/*
	 * XXXX
	 * Test below is commented because it is believed that doing
	 * config read/write beyond 0xff will access the config space
	 * for the next larger function.  Uncomment this and remove
	 * the hacky pmap_mapdev() way of accessing config space when
	 * DragonFly grows support for extended pcie config space access.
	 */
#if 0
	/*
	 * See if we can, by some miracle, access the extended
	 * config space
	 */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/*
	 * Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off = base + 0x00100000UL * (unsigned long)bus +
	    0x00001000UL * (unsigned long)(func + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (!(vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
		    vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (bootverbose) {
		device_printf(sc->dev, "Enabled ECRC on upstream "
		    "Nvidia bridge at %d:%d:%d\n",
		    (int)bus, (int)slot, (int)func);
	}
}

#else	/* __i386__ || __x86_64__ */

static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
}

#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
	int status;
	uint32_t len;
	const char *test = " ";

	/*
	 * Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
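	/*
	 * For example, if cmd.data0 comes back as (100 << 16) | 50, then
	 * 100 transfers of 'len' bytes completed in 50 * 0.5us = 25us,
	 * and (100 * len * 2) / 50 is the bandwidth in bytes/us, i.e.
	 * MB/s -- exactly the expressions computed below.
	 */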

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
	    (cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
		    test, status);
	}
	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
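/*
 * In short, a condensed restatement of the selection logic in
 * mxge_firmware_probe() and mxge_select_firmware() below:
 *
 *	completions known aligned (ECRC enabled, link width <= 4,
 *	or rev >= Z8ES)		-> eth_z8e,  tx_boundary = 4096
 *	otherwise		-> ethp_z8e, tx_boundary = 2048
 */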
static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;

	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
			    pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * Load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0)
		return status;

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.  Not required on Z8ES or newer.
	 */
	if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
		return 0;

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS) {
		device_printf(dev, "Falling back to ethp! "
		    "Please install up-to-date firmware\n");
	}
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (bootverbose) {
			device_printf(sc->dev,
			    "Assuming %s completions (forced)\n",
			    aligned ? "aligned" : "unaligned");
		}
		goto abort;
	}

	/*
	 * If the PCIe link width is 4 or less, we can use the aligned
	 * firmware and skip any checks
	 */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev, "PCIe x%d Link, "
		    "expect reduced performance\n", sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (mxge_firmware_probe(sc) == 0)
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return mxge_load_firmware(sc, 0);
}

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{
	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
		    be32toh(hdr->mcp_type));
		return EIO;
	}

	/* Save firmware version for sysctl */
	strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
	if (bootverbose)
		if_printf(sc->ifp, "firmware id: %s\n", hdr->version);

	ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	    &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
	      sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		if_printf(sc->ifp, "Found firmware version %s\n",
		    sc->fw_version);
		if_printf(sc->ifp, "Driver needs %d.%d\n",
		    MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
	return kmalloc(items * size, M_TEMP, M_WAITOK);
}

static void
z_free(void *nil, void *ptr)
{
	kfree(ptr, M_TEMP);
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		if_printf(sc->ifp, "Could not find firmware image %s\n",
		    sc->fw_name);
		return ENOENT;
	}

	/* Setup zlib and decompress f/w */
	bzero(&zs, sizeof(zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/*
	 * The uncompressed size is stored as the firmware version,
	 * which would otherwise go unused
	 */
	fw_len = (size_t)fw->version;
	inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		if_printf(sc->ifp, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* Check id */
	hdr_offset =
	htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		if_printf(sc->ifp, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
		    min(256U, (unsigned)(fw_len - i)));
		wmb();
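		/*
		 * Read back a byte of SRAM to flush the posted PIO
		 * writes above before starting on the next chunk.
		 */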
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	kfree(inflate_buffer, M_TEMP);
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send an rdma command to the PCIe engine, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
	buf[3] = htobe32(dma_high);		/* dummy addr MSW */
	buf[4] = htobe32(dma_low);		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)\n",
		    (enable ? "enable" : "disable"), confirm, *confirm);
	}
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* Ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);

	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/*
	 * Wait up to 20ms
	 */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			if_printf(sc->ifp, "command %d failed, result = %d\n",
			    cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN) {
		if_printf(sc->ifp, "command %d timed out, result = %d\n",
		    cmd, be32toh(response->result));
	}
	return err;
}
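
#if 0
/*
 * A minimal usage sketch (not compiled in): commands execute
 * synchronously and return a single 32-bit result in cmd.data0.
 * For example, fetching the firmware's send ring size:
 */
static int
mxge_get_send_ring_size(mxge_softc_t *sc, int *ring_bytes)
{
	mxge_cmd_t cmd;
	int status;

	memset(&cmd, 0, sizeof(cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	if (status == 0)
		*ring_bytes = cmd.data0;	/* size in bytes */
	return status;
}
#endif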

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof(struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/*
	 * Find running firmware header
	 */
	hdr_offset =
	htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		if_printf(sc->ifp, "Running firmware has bad header offset "
		    "(%zu)\n", hdr_offset);
		return EIO;
	}

	/*
	 * Copy header of running firmware from SRAM to host memory to
	 * validate firmware
	 */
	hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
	    rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	kfree(hdr, M_DEVBUF);

	/*
	 * Check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
		    "working around rx filter bug\n",
		    sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
	}

	return status;
}

static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;

		/*
		 * Try to use the currently running firmware, if
		 * it is new enough
		 */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			if_printf(sc->ifp,
			    "failed to adopt running firmware\n");
			return status;
		}
		if_printf(sc->ifp, "Successfully adopted running firmware\n");

		if (sc->tx_boundary == 4096) {
			if_printf(sc->ifp,
			     "Using firmware currently running on NIC.  "
			     "For optimal\n");
			if_printf(sc->ifp, "performance consider loading "
			     "optimized firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}

	/* Clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/*
	 * Send a reload command to the bootstrap MCP, and wait for the
	 * response in the confirmation address.  The firmware should
	 * write a -1 there to indicate it is alive and well
	 */

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/*
	 * FIX: All newest firmware should un-protect the bottom of
	 * the sram before handoff. However, the very first interfaces
	 * do not. Therefore the handoff copy must skip the first 8 bytes
	 */
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8);	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
	}
	if (*confirm != 0xffffffff) {
		if_printf(sc->ifp, "handoff failed (%p = 0x%x)\n",
		    confirm, *confirm);
		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;

	cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
	    (addr[2] << 8) | addr[3];
	cmd.data1 = (addr[4] << 8) | (addr[5]);
	return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
	if (status) {
		if_printf(sc->ifp, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
	if (status)
		if_printf(sc->ifp, "Failed to set promisc mode\n");
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
		    "error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI) {
		/* Request to disable multicast filtering, so quit here */
		return;
	}

	/* Flush all the filters */
	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
		    "error status: %d\n", err);
		return;
	}

	/*
	 * Walk the multicast list, and add each address
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;

		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		    &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		    &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
			    "error status: %d\n", err);
			/* Abort, leaving multicast filtering off */
			return;
		}
	}

	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
		    "error status: %d\n", err);
	}
}

#if 0
static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/*
	 * Try to set nbufs to see if we can
	 * use virtually contiguous jumbos
	 */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}
#endif

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status, rx_intr_size;

	/*
	 * Try to send a reset command to the card to see if it
	 * is alive
	 */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		if_printf(sc->ifp, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/*
	 * Set the intrq size
	 * XXX assume 4-byte mcp_slot
	 */
	rx_intr_size = sc->rx_intr_slots * sizeof(mcp_slot_t);
	cmd.data0 = rx_intr_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */
	if (sc->num_slices > 1) {
		/* Ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to get number of slices\n");
			return status;
		}

		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
		if (status != 0) {
			if_printf(sc->ifp, "failed to set number of slices\n");
			return status;
		}
	}

	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];

			rx_done = &ss->rx_data.rx_done;
			memset(rx_done->entry, 0, rx_intr_size);

			cmd.data0 =
			    MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data1 =
			    MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
			    &cmd);
		}
	}

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
	    &cmd);
	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);

	if (status != 0) {
		if_printf(sc->ifp, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* Run a DMA benchmark */
	mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);

		/* Reset mcp/driver shared state back to 0 */
		ss->rx_data.rx_done.idx = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->rx_data.rx_big.cnt = 0;
		ss->rx_data.rx_small.cnt = 0;
		if (ss->fw_stats != NULL)
			bzero(ss->fw_stats, sizeof(*ss->fw_stats));
	}
	sc->rdma_tags_available = 15;

	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);

	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
			if_printf(sc->ifp, "can't enable throttle\n");
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
	if (err != 0)
		return err;

	if (throttle == sc->throttle)
		return 0;

	if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0)
		return err;

	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	ifnet_serialize_all(sc->ifp);

	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	ifnet_deserialize_all(sc->ifp);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0)
		return err;

	if (enabled == sc->pause)
		return 0;

	ifnet_serialize_all(sc->ifp);
	err = mxge_change_pause(sc, enabled);
	ifnet_deserialize_all(sc->ifp);

	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
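	/*
	 * Byte-swap the firmware counter and hand it to
	 * sysctl_handle_int() via arg2 with a NULL arg1, so the value
	 * is exported read-only and nothing is written back to the
	 * stats block.
	 */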
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	if (sc->ss != NULL) {
		struct mxge_slice_state *ss;
		int slice;

		for (slice = 0; slice < sc->num_slices; slice++) {
			ss = &sc->ss[slice];
			if (ss->sysctl_tree != NULL) {
				sysctl_ctx_free(&ss->sysctl_ctx);
				ss->sysctl_tree = NULL;
			}
		}
	}

	if (sc->slice_sysctl_tree != NULL) {
		sysctl_ctx_free(&sc->slice_sysctl_ctx);
		sc->slice_sysctl_tree = NULL;
	}

	if (sc->sysctl_tree != NULL) {
		sysctl_ctx_free(&sc->sysctl_ctx);
		sc->sysctl_tree = NULL;
	}
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = &sc->sysctl_ctx;
	sysctl_ctx_init(ctx);
	sc->sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_STATIC_CHILDREN(_hw),
	    OID_AUTO, device_get_nameunit(sc->dev), CTLFLAG_RD, 0, "");
	if (sc->sysctl_tree == NULL) {
		device_printf(sc->dev, "can't add sysctl node\n");
		return;
	}

	children = SYSCTL_CHILDREN(sc->sysctl_tree);
	fw = sc->ss[0].fw_stats;

	/*
	 * Random information
	 */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
	    CTLFLAG_RD, &sc->fw_version, 0, "firmware version");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
	    CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");

	SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
	    CTLFLAG_RD, &sc->product_code_string, 0, "product code");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
	    CTLFLAG_RD, &sc->link_width, 0, "link width");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
	    CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
	    CTLFLAG_RD, &sc->wc, 0, "write combining PIO");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
	    CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
	    CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
	    CTLFLAG_RD, &sc->read_write_dma, 0,
	    "DMA concurrent Read/Write speed in MB/s");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
	    CTLFLAG_RD, &sc->watchdog_resets, 0,
	    "Number of times NIC was reset");

	/*
	 * Performance related tunables
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
	    "Interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
	    "Transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "flow_control_enabled",
	    CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_flow_control, "I",
	    "Flow control (PAUSE frames) enabled");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
	    CTLFLAG_RW, &mxge_deassert_wait, 0,
	    "Wait for IRQ line to go low in ihandler");

	/*
	 * Stats block from firmware is in network byte order.
	 * Need to swap it
	 */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
	    mxge_handle_be32, "I", "link up");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
	    mxge_handle_be32, "I", "rdma_tags_available");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
	    mxge_handle_be32, "I", "dropped_bad_crc32");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
	    mxge_handle_be32, "I", "dropped_bad_phy");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
	    mxge_handle_be32, "I", "dropped_link_error_or_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
	    mxge_handle_be32, "I", "dropped_link_overflow");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_multicast_filtered");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_big_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
	    mxge_handle_be32, "I", "dropped_no_small_buffer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
	    mxge_handle_be32, "I", "dropped_overrun");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
	    mxge_handle_be32, "I", "dropped_pause");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
	    mxge_handle_be32, "I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
	    CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
	    mxge_handle_be32, "I", "dropped_unicast_filtered");

	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
	    children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
	if (sc->slice_sysctl_tree == NULL) {
		device_printf(sc->dev, "can't add slice sysctl node\n");
		return;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		ksprintf(slice_num, "%d", slice);
		ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
		    slice_num, CTLFLAG_RD, 0, "");
		if (ss->sysctl_tree == NULL) {
			device_printf(sc->dev,
			    "can't add %d slice sysctl node\n", slice);
			return;	/* XXX continue? */
		}
		children = SYSCTL_CHILDREN(ss->sysctl_tree);

		/*
		 * XXX change to ULONG
		 */

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
		    CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_big_cnt");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
		    CTLFLAG_RD, &ss->tx.req, 0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
		    CTLFLAG_RD, &ss->tx.done, 0, "tx_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
		    CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
		    CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
		    CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
		    CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * backwards one at a time and handle ring wraps
 */
static __inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
		wmb();
	}
}

/*
 * Copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
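/*
 * Each mcp_kreq_ether_send_t is 16 bytes, so the main loop below moves
 * two requests per 32-byte PIO write; the final valid-flag rewrite
 * touches only the last 4 bytes of the first request.
 */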
static __inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	wmb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < cnt - 1; i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			wmb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * Submit all but the first request, and ensure
		 * that it is submitted below
		 */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* Submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		wmb(); /* barrier before setting valid flag */
	}

	/* Re-write the last 32 bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	wmb();
}

static int
mxge_pullup_tso(struct mbuf **mp)
{
	int hoff, iphlen, thoff;
	struct mbuf *m;

	m = *mp;
	KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));

	iphlen = m->m_pkthdr.csum_iphlen;
	thoff = m->m_pkthdr.csum_thlen;
	hoff = m->m_pkthdr.csum_lhlen;

	KASSERT(iphlen > 0, ("invalid ip hlen"));
	KASSERT(thoff > 0, ("invalid tcp hlen"));
	KASSERT(hoff > 0, ("invalid ether hlen"));

	if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
		m = m_pullup(m, hoff + iphlen + thoff);
		if (m == NULL) {
			*mp = NULL;
			return ENOBUFS;
		}
		*mp = m;
	}
	return 0;
}

static int
mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
    struct mbuf *m, int busdma_seg_cnt)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	struct mxge_buffer_state *info_last;
	bus_dmamap_t map = info_map->map;

	mss = m->m_pkthdr.tso_segsz;

	/*
	 * Negative cum_len signifies to the send loop that we are
	 * still in the header portion of the TSO packet.
	 */
	cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
	    m->m_pkthdr.csum_thlen);

	/*
	 * TSO implies checksum offload on this hardware
	 */
	cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

	/*
	 * For TSO, pseudo_hdr_offset holds mss.  The firmware figures
	 * out where to put the checksum by parsing the header.
	 */
	pseudo_hdr_offset = htobe16(mss);

	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;

	/*
	 * "rdma_count" is the number of RDMAs belonging to the current
	 * packet BEFORE the current send request.  For non-TSO packets,
	 * this is equal to "count".
	 *
	 * For TSO packets, rdma_count needs to be reset to 0 after a
	 * segment cut.
	 *
	 * The rdma_count field of the send request is the number of
	 * RDMAs of the packet starting at that request.  For TSO send
	 * requests with one or more cuts in the middle, this is the
	 * number of RDMAs starting after the last cut in the request.
	 * All previous segments before the last cut implicitly have 1
	 * RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand, it must be
	 * filled-in retroactively - after each segmentation cut or at
	 * the end of the entire packet.
	 */
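	/*
	 * For example, if the payload between two cuts is carried by
	 * three consecutive send requests, the first request of that run
	 * ends up with rdma_count == 3, filled in retroactively by the
	 * "(req - rdma_count)->rdma_count" stores below, while the later
	 * requests in the run keep their provisional value of 1.
	 */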

	while (busdma_seg_cnt) {
		/*
		 * Break the busdma segment up into pieces
		 */
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			(req - rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* Payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |=
				    next_is_first * MXGEFW_FLAGS_FIRST;
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* Header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
				    MXGEFW_FLAGS_FIRST |
				    (small * MXGEFW_FLAGS_SMALL);
			}

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags =
			    flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req - rdma_count)->rdma_count = rdma_count;

	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];

	info_map->map = info_last->map;
	info_last->map = map;
	info_last->m = m;

	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
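	/*
	 * XXX 'ss' is not in scope in this function; this block would
	 * not compile if IFNET_BUF_RING were defined (it is not on
	 * DragonFly).
	 */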
1783 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1784 		/* tell the NIC to start polling this slice */
1785 		*tx->send_go = 1;
1786 		tx->queue_active = 1;
1787 		tx->activate++;
1788 		wmb();
1789 	}
1790 #endif
1791 	return 0;
1792 
1793 drop:
1794 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1795 	m_freem(m);
1796 	return ENOBUFS;
1797 }
1798 
1799 static int
1800 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1801 {
1802 	mcp_kreq_ether_send_t *req;
1803 	bus_dma_segment_t *seg;
1804 	bus_dmamap_t map;
1805 	int cnt, cum_len, err, i, idx, odd_flag;
1806 	uint16_t pseudo_hdr_offset;
1807 	uint8_t flags, cksum_offset;
1808 	struct mxge_buffer_state *info_map, *info_last;
1809 
1810 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1811 		err = mxge_pullup_tso(&m);
1812 		if (__predict_false(err))
1813 			return err;
1814 	}
1815 
1816 	/*
1817 	 * Map the frame for DMA
1818 	 */
1819 	idx = tx->req & tx->mask;
1820 	info_map = &tx->info[idx];
1821 	map = info_map->map;
1822 
1823 	err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1824 	    tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1825 	if (__predict_false(err != 0))
1826 		goto drop;
1827 	bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1828 
1829 	/*
1830 	 * TSO is different enough, we handle it in another routine
1831 	 */
1832 	if (m->m_pkthdr.csum_flags & CSUM_TSO)
1833 		return mxge_encap_tso(tx, info_map, m, cnt);
1834 
1835 	req = tx->req_list;
1836 	cksum_offset = 0;
1837 	pseudo_hdr_offset = 0;
1838 	flags = MXGEFW_FLAGS_NO_TSO;
1839 
1840 	/*
1841 	 * Checksum offloading
1842 	 */
1843 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1844 		cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1845 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1846 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1847 		req->cksum_offset = cksum_offset;
1848 		flags |= MXGEFW_FLAGS_CKSUM;
1849 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1850 	} else {
1851 		odd_flag = 0;
1852 	}
1853 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1854 		flags |= MXGEFW_FLAGS_SMALL;
1855 
1856 	/*
1857 	 * Convert segments into a request list
1858 	 */
1859 	cum_len = 0;
1860 	seg = tx->seg_list;
1861 	req->flags = MXGEFW_FLAGS_FIRST;
1862 	for (i = 0; i < cnt; i++) {
1863 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1864 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1865 		req->length = htobe16(seg->ds_len);
1866 		req->cksum_offset = cksum_offset;
1867 		if (cksum_offset > seg->ds_len)
1868 			cksum_offset -= seg->ds_len;
1869 		else
1870 			cksum_offset = 0;
1871 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1872 		req->pad = 0; /* complete solid 16-byte block */
1873 		req->rdma_count = 1;
1874 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1875 		cum_len += seg->ds_len;
1876 		seg++;
1877 		req++;
1878 		req->flags = 0;
1879 	}
1880 	req--;
1881 
1882 	/*
1883 	 * Pad runt to 60 bytes
1884 	 */
1885 	if (cum_len < 60) {
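		/*
		 * Chain one more descriptor that DMAs the remainder
		 * from the preallocated zero-filled pad buffer.
		 */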
1886 		req++;
1887 		req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1888 		req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1889 		req->length = htobe16(60 - cum_len);
1890 		req->cksum_offset = 0;
1891 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1892 		req->pad = 0; /* complete solid 16-byte block */
1893 		req->rdma_count = 1;
1894 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1895 		cnt++;
1896 	}
1897 
1898 	tx->req_list[0].rdma_count = cnt;
1899 #if 0
1900 	/* print what the firmware will see */
1901 	for (i = 0; i < cnt; i++) {
1902 		kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1903 		    "cso:%d, flags:0x%x, rdma:%d\n",
1904 		    i, (int)ntohl(tx->req_list[i].addr_high),
1905 		    (int)ntohl(tx->req_list[i].addr_low),
1906 		    (int)ntohs(tx->req_list[i].length),
1907 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1908 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1909 		    tx->req_list[i].rdma_count);
1910 	}
1911 	kprintf("--------------\n");
1912 #endif
1913 	info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1914 
1915 	info_map->map = info_last->map;
1916 	info_last->map = map;
1917 	info_last->m = m;
1918 
1919 	mxge_submit_req(tx, tx->req_list, cnt);
1920 #ifdef IFNET_BUF_RING
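	/* XXX stale: ss is not in scope in this function */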
1921 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1922 		/* tell the NIC to start polling this slice */
1923 		*tx->send_go = 1;
1924 		tx->queue_active = 1;
1925 		tx->activate++;
1926 		wmb();
1927 	}
1928 #endif
1929 	return 0;
1930 
1931 drop:
1932 	m_freem(m);
1933 	return err;
1934 }
1935 
1936 static void
1937 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1938 {
1939 	mxge_softc_t *sc = ifp->if_softc;
1940 	mxge_tx_ring_t *tx;
1941 	bus_addr_t zeropad;
1942 	int encap = 0;
1943 
1944 	/* XXX Only use the first slice for now */
1945 	tx = &sc->ss[0].tx;
1946 
1947 	ASSERT_ALTQ_SQ_DEFAULT(ifp, ifsq);
1948 	ASSERT_SERIALIZED(&tx->tx_serialize);
1949 
1950 	if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1951 		return;
1952 
1953 	zeropad = sc->zeropad_dma.dmem_busaddr;
1954 	while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1955 		struct mbuf *m;
1956 		int error;
1957 
1958 		m = ifsq_dequeue(ifsq);
1959 		if (m == NULL)
1960 			goto done;
1961 
1962 		BPF_MTAP(ifp, m);
1963 		error = mxge_encap(tx, m, zeropad);
1964 		if (!error)
1965 			encap = 1;
1966 		else
1967 			IFNET_STAT_INC(ifp, oerrors, 1);
1968 	}
1969 
1970 	/* Ran out of transmit slots */
1971 	ifsq_set_oactive(ifsq);
1972 done:
1973 	if (encap)
1974 		ifp->if_timer = 5;
1975 }
1976 
1977 static void
1978 mxge_watchdog(struct ifnet *ifp)
1979 {
1980 	struct mxge_softc *sc = ifp->if_softc;
1981 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1982 	mxge_tx_ring_t *tx = &sc->ss[0].tx;
1983 
1984 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
1985 
1986 	/* Check for pause blocking before resetting */
1987 	if (tx->watchdog_rx_pause == rx_pause) {
1988 		mxge_warn_stuck(sc, tx, 0);
1989 		mxge_watchdog_reset(sc);
1990 		return;
1991 	} else {
1992 		if_printf(ifp, "Flow control blocking xmits, "
1993 		    "check link partner\n");
1994 	}
1995 	tx->watchdog_rx_pause = rx_pause;
1996 }
1997 
1998 /*
1999  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2000  * at most 32 bytes at a time, so as to avoid involving the software
2001  * pio handler in the nic.  We re-write the first segment's low
2002  * DMA address to mark it valid only after we write the entire chunk
2003  * in a burst
2004  */
2005 static __inline void
2006 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2007     mcp_kreq_ether_recv_t *src)
2008 {
2009 	uint32_t low;
2010 
2011 	low = src->addr_low;
2012 	src->addr_low = 0xffffffff;
2013 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2014 	wmb();
2015 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2016 	wmb();
2017 	src->addr_low = low;
2018 	dst->addr_low = low;
2019 	wmb();
2020 }
2021 
2022 static int
2023 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2024     boolean_t init)
2025 {
2026 	bus_dma_segment_t seg;
2027 	struct mbuf *m;
2028 	int cnt, err, mflag;
2029 
2030 	mflag = MB_DONTWAIT;
2031 	if (__predict_false(init))
2032 		mflag = MB_WAIT;
2033 
2034 	m = m_gethdr(mflag, MT_DATA);
2035 	if (m == NULL) {
2036 		err = ENOBUFS;
2037 		if (__predict_false(init)) {
2038 			/*
2039 			 * During initialization, there
2040 			 * is nothing to setup; bail out
2041 			 */
2042 			return err;
2043 		}
2044 		goto done;
2045 	}
2046 	m->m_len = m->m_pkthdr.len = MHLEN;
2047 
2048 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2049 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2050 	if (err != 0) {
2051 		m_freem(m);
2052 		if (__predict_false(init)) {
2053 			/*
2054 			 * During initialization, there
2055 			 * is nothing to setup; bail out
2056 			 */
2057 			return err;
2058 		}
2059 		goto done;
2060 	}
2061 
2062 	rx->info[idx].m = m;
2063 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2064 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2065 
2066 done:
2067 	if ((idx & 7) == 7)
2068 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2069 	return err;
2070 }
2071 
2072 static int
2073 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2074     boolean_t init)
2075 {
2076 	bus_dma_segment_t seg;
2077 	struct mbuf *m;
2078 	int cnt, err, mflag;
2079 
2080 	mflag = MB_DONTWAIT;
2081 	if (__predict_false(init))
2082 		mflag = MB_WAIT;
2083 
2084 	if (rx->cl_size == MCLBYTES)
2085 		m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2086 	else
2087 		m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2088 	if (m == NULL) {
2089 		err = ENOBUFS;
2090 		if (__predict_false(init)) {
2091 			/*
2092 			 * During initialization, there
2093 			 * is nothing to setup; bail out
2094 			 */
2095 			return err;
2096 		}
2097 		goto done;
2098 	}
2099 	m->m_len = m->m_pkthdr.len = rx->cl_size;
2100 
2101 	err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2102 	    &seg, 1, &cnt, BUS_DMA_NOWAIT);
2103 	if (err != 0) {
2104 		m_freem(m);
2105 		if (__predict_false(init)) {
2106 			/*
2107 			 * During initialization, there
2108 			 * is nothing to setup; bail out
2109 			 */
2110 			return err;
2111 		}
2112 		goto done;
2113 	}
2114 
2115 	rx->info[idx].m = m;
2116 	rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2117 	rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2118 
2119 done:
2120 	if ((idx & 7) == 7)
2121 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2122 	return err;
2123 }
2124 
2125 /*
2126  * Myri10GE hardware checksums are not valid if the sender
2127  * padded the frame with non-zero padding.  This is because
2128  * the firmware just does a simple 16-bit 1s complement
2129  * checksum across the entire frame, excluding the first 14
 * bytes.  It is best to simply check the checksum and
 * tell the stack about it only if the checksum is good
2132  */
2133 static __inline uint16_t
2134 mxge_rx_csum(struct mbuf *m, int csum)
2135 {
2136 	const struct ether_header *eh;
2137 	const struct ip *ip;
2138 	uint16_t c;
2139 
2140 	eh = mtod(m, const struct ether_header *);
2141 
2142 	/* Only deal with IPv4 TCP & UDP for now */
2143 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2144 		return 1;
2145 
2146 	ip = (const struct ip *)(eh + 1);
2147 	if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2148 		return 1;
2149 
2150 #ifdef INET
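	/*
	 * Fold the IP pseudo-header into the firmware's partial sum;
	 * after the final inversion below, a zero result means the
	 * TCP/UDP checksum verified.
	 */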
2151 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htonl(ntohs(csum) + ntohs(ip->ip_len) -
	        (ip->ip_hl << 2) + ip->ip_p));
2154 #else
2155 	c = 1;
2156 #endif
2157 	c ^= 0xffff;
2158 	return c;
2159 }
2160 
2161 static void
2162 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2163 {
2164 	struct ether_vlan_header *evl;
2165 	uint32_t partial;
2166 
2167 	evl = mtod(m, struct ether_vlan_header *);
2168 
2169 	/*
2170 	 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2171 	 * what the firmware thought was the end of the ethernet
2172 	 * header.
2173 	 */
2174 
2175 	/* Put checksum into host byte order */
2176 	*csum = ntohs(*csum);
2177 
2178 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
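	/*
	 * One's complement subtraction: add the complement of the
	 * 4 VLAN header bytes, then fold the carries back into the
	 * low 16 bits.
	 */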
2179 	*csum += ~partial;
2180 	*csum += ((*csum) < ~partial);
2181 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2182 	*csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2183 
2184 	/*
2185 	 * Restore checksum to network byte order;
2186 	 * later consumers expect this
2187 	 */
2188 	*csum = htons(*csum);
2189 
2190 	/* save the tag */
2191 	m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2192 	m->m_flags |= M_VLANTAG;
2193 
2194 	/*
2195 	 * Remove the 802.1q header by copying the Ethernet
2196 	 * addresses over it and adjusting the beginning of
2197 	 * the data in the mbuf.  The encapsulated Ethernet
2198 	 * type field is already in place.
2199 	 */
2200 	bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2201 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
2202 	m_adj(m, EVL_ENCAPLEN);
2203 }
2204 
2205 
2206 static __inline void
2207 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2208     uint32_t len, uint32_t csum)
2209 {
2210 	struct mbuf *m;
2211 	const struct ether_header *eh;
2212 	bus_dmamap_t old_map;
2213 	int idx;
2214 
2215 	idx = rx->cnt & rx->mask;
2216 	rx->cnt++;
2217 
2218 	/* Save a pointer to the received mbuf */
2219 	m = rx->info[idx].m;
2220 
2221 	/* Try to replace the received mbuf */
2222 	if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2223 		/* Drop the frame -- the old mbuf is re-cycled */
2224 		IFNET_STAT_INC(ifp, ierrors, 1);
2225 		return;
2226 	}
2227 
2228 	/* Unmap the received buffer */
2229 	old_map = rx->info[idx].map;
2230 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2231 	bus_dmamap_unload(rx->dmat, old_map);
2232 
2233 	/* Swap the bus_dmamap_t's */
2234 	rx->info[idx].map = rx->extra_map;
2235 	rx->extra_map = old_map;
2236 
2237 	/*
2238 	 * mcp implicitly skips 1st 2 bytes so that packet is properly
2239 	 * aligned
2240 	 */
2241 	m->m_data += MXGEFW_PAD;
2242 
2243 	m->m_pkthdr.rcvif = ifp;
2244 	m->m_len = m->m_pkthdr.len = len;
2245 
2246 	IFNET_STAT_INC(ifp, ipackets, 1);
2247 
2248 	eh = mtod(m, const struct ether_header *);
2249 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2250 		mxge_vlan_tag_remove(m, &csum);
2251 
2252 	/* If the checksum is valid, mark it in the mbuf header */
2253 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2254 	    mxge_rx_csum(m, csum) == 0) {
2255 		/* Tell the stack that the checksum is good */
2256 		m->m_pkthdr.csum_data = 0xffff;
2257 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2258 		    CSUM_DATA_VALID;
2259 	}
2260 	ifp->if_input(ifp, m);
2261 }
2262 
2263 static __inline void
2264 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2265     uint32_t len, uint32_t csum)
2266 {
2267 	const struct ether_header *eh;
2268 	struct mbuf *m;
2269 	bus_dmamap_t old_map;
2270 	int idx;
2271 
2272 	idx = rx->cnt & rx->mask;
2273 	rx->cnt++;
2274 
2275 	/* Save a pointer to the received mbuf */
2276 	m = rx->info[idx].m;
2277 
2278 	/* Try to replace the received mbuf */
2279 	if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2280 		/* Drop the frame -- the old mbuf is re-cycled */
2281 		IFNET_STAT_INC(ifp, ierrors, 1);
2282 		return;
2283 	}
2284 
2285 	/* Unmap the received buffer */
2286 	old_map = rx->info[idx].map;
2287 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2288 	bus_dmamap_unload(rx->dmat, old_map);
2289 
2290 	/* Swap the bus_dmamap_t's */
2291 	rx->info[idx].map = rx->extra_map;
2292 	rx->extra_map = old_map;
2293 
2294 	/*
2295 	 * mcp implicitly skips 1st 2 bytes so that packet is properly
2296 	 * aligned
2297 	 */
2298 	m->m_data += MXGEFW_PAD;
2299 
2300 	m->m_pkthdr.rcvif = ifp;
2301 	m->m_len = m->m_pkthdr.len = len;
2302 
2303 	IFNET_STAT_INC(ifp, ipackets, 1);
2304 
2305 	eh = mtod(m, const struct ether_header *);
2306 	if (eh->ether_type == htons(ETHERTYPE_VLAN))
2307 		mxge_vlan_tag_remove(m, &csum);
2308 
2309 	/* If the checksum is valid, mark it in the mbuf header */
2310 	if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2311 	    mxge_rx_csum(m, csum) == 0) {
2312 		/* Tell the stack that the checksum is good */
2313 		m->m_pkthdr.csum_data = 0xffff;
2314 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2315 		    CSUM_DATA_VALID;
2316 	}
2317 	ifp->if_input(ifp, m);
2318 }
2319 
2320 static __inline void
2321 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data)
2322 {
2323 	mxge_rx_done_t *rx_done = &rx_data->rx_done;
2324 
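	/*
	 * A non-zero length marks a valid completion; clear each
	 * entry as it is consumed so the slot can be re-armed.
	 */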
2325 	while (rx_done->entry[rx_done->idx].length != 0) {
2326 		uint16_t length, checksum;
2327 
2328 		length = ntohs(rx_done->entry[rx_done->idx].length);
2329 		rx_done->entry[rx_done->idx].length = 0;
2330 
2331 		checksum = rx_done->entry[rx_done->idx].checksum;
2332 
2333 		if (length <= MXGE_RX_SMALL_BUFLEN) {
2334 			mxge_rx_done_small(ifp, &rx_data->rx_small,
2335 			    length, checksum);
2336 		} else {
2337 			mxge_rx_done_big(ifp, &rx_data->rx_big,
2338 			    length, checksum);
2339 		}
2340 
2341 		rx_done->idx++;
2342 		rx_done->idx &= rx_done->mask;
2343 	}
2344 }
2345 
2346 static __inline void
2347 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2348 {
2349 	ASSERT_SERIALIZED(&tx->tx_serialize);
2350 
2351 	while (tx->pkt_done != mcp_idx) {
2352 		struct mbuf *m;
2353 		int idx;
2354 
2355 		idx = tx->done & tx->mask;
2356 		tx->done++;
2357 
2358 		m = tx->info[idx].m;
2359 		/*
2360 		 * mbuf and DMA map only attached to the first
2361 		 * segment per-mbuf.
2362 		 */
2363 		if (m != NULL) {
2364 			tx->pkt_done++;
2365 			IFNET_STAT_INC(ifp, opackets, 1);
2366 			tx->info[idx].m = NULL;
2367 			bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2368 			m_freem(m);
2369 		}
2370 	}
2371 
2372 	/*
2373 	 * If we have space, clear OACTIVE to tell the stack that
	 * it's OK to send packets
2375 	 */
2376 	if (tx->req - tx->done < (tx->mask + 1) / 2) {
2377 		ifq_clr_oactive(&ifp->if_snd);
2378 		if (tx->req == tx->done)
2379 			ifp->if_timer = 0;
2380 	}
2381 
2382 	if (!ifq_is_empty(&ifp->if_snd))
2383 		if_devstart(ifp);
2384 
#ifdef IFNET_BUF_RING
	/* XXX stale: ss is not in scope in this function */
	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
		/*
		 * Let the NIC stop polling this queue, since there
		 * are no more transmits pending
		 */
		*tx->send_stop = 1;
		tx->queue_active = 0;
		tx->deactivate++;
		wmb();
	}
#endif
2397 }
2398 
2399 static struct mxge_media_type mxge_xfp_media_types[] = {
2400 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2401 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2402 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2403 	{0,		(1 << 5),	"10GBASE-ER"},
2404 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2405 	{0,		(1 << 3),	"10GBASE-SW"},
2406 	{0,		(1 << 2),	"10GBASE-LW"},
2407 	{0,		(1 << 1),	"10GBASE-EW"},
2408 	{0,		(1 << 0),	"Reserved"}
2409 };
2410 
2411 static struct mxge_media_type mxge_sfp_media_types[] = {
2412 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2413 	{0,		(1 << 7),	"Reserved"},
2414 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2415 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2416 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2417 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2418 };
2419 
2420 static void
2421 mxge_media_set(mxge_softc_t *sc, int media_type)
2422 {
2423 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type, 0, NULL);
2424 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2425 	sc->current_media = media_type;
2426 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2427 }
2428 
2429 static void
2430 mxge_media_init(mxge_softc_t *sc)
2431 {
2432 	const char *ptr;
2433 	int i;
2434 
2435 	ifmedia_removeall(&sc->media);
2436 	mxge_media_set(sc, IFM_AUTO);
2437 
2438 	/*
	 * Parse the product code to determine the interface type
2440 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2441 	 * after the 3rd dash in the driver's cached copy of the
2442 	 * EEPROM's product code string.
2443 	 */
2444 	ptr = sc->product_code_string;
2445 	if (ptr == NULL) {
2446 		if_printf(sc->ifp, "Missing product code\n");
2447 		return;
2448 	}
2449 
2450 	for (i = 0; i < 3; i++, ptr++) {
2451 		ptr = strchr(ptr, '-');
2452 		if (ptr == NULL) {
2453 			if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2454 			return;
2455 		}
2456 	}
	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2458 		/* -C is CX4 */
2459 		sc->connector = MXGE_CX4;
2460 		mxge_media_set(sc, IFM_10G_CX4);
2461 	} else if (*ptr == 'Q') {
2462 		/* -Q is Quad Ribbon Fiber */
2463 		sc->connector = MXGE_QRF;
2464 		if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2465 		/* DragonFly has no media type for Quad ribbon fiber */
2466 	} else if (*ptr == 'R') {
2467 		/* -R is XFP */
2468 		sc->connector = MXGE_XFP;
	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2470 		/* -S or -2S is SFP+ */
2471 		sc->connector = MXGE_SFP;
2472 	} else {
2473 		if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2474 	}
2475 }
2476 
2477 /*
2478  * Determine the media type for a NIC.  Some XFPs will identify
2479  * themselves only when their link is up, so this is initiated via a
2480  * link up interrupt.  However, this can potentially take up to
2481  * several milliseconds, so it is run via the watchdog routine, rather
2482  * than in the interrupt handler itself.
2483  */
2484 static void
2485 mxge_media_probe(mxge_softc_t *sc)
2486 {
2487 	mxge_cmd_t cmd;
2488 	const char *cage_type;
2489 	struct mxge_media_type *mxge_media_types = NULL;
2490 	int i, err, ms, mxge_media_type_entries;
2491 	uint32_t byte;
2492 
2493 	sc->need_media_probe = 0;
2494 
2495 	if (sc->connector == MXGE_XFP) {
2496 		/* -R is XFP */
2497 		mxge_media_types = mxge_xfp_media_types;
2498 		mxge_media_type_entries = sizeof(mxge_xfp_media_types) /
2499 		    sizeof(mxge_xfp_media_types[0]);
2500 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2501 		cage_type = "XFP";
	} else if (sc->connector == MXGE_SFP) {
2503 		/* -S or -2S is SFP+ */
2504 		mxge_media_types = mxge_sfp_media_types;
2505 		mxge_media_type_entries = sizeof(mxge_sfp_media_types) /
2506 		    sizeof(mxge_sfp_media_types[0]);
2507 		cage_type = "SFP+";
2508 		byte = 3;
2509 	} else {
2510 		/* nothing to do; media type cannot change */
2511 		return;
2512 	}
2513 
2514 	/*
2515 	 * At this point we know the NIC has an XFP cage, so now we
2516 	 * try to determine what is in the cage by using the
	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2518 	 * register.  We read just one byte, which may take over
2519 	 * a millisecond
2520 	 */
2521 
2522 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2523 	cmd.data1 = byte;
2524 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2525 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2526 		if_printf(sc->ifp, "failed to read XFP\n");
2527 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2528 		if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2529 	if (err != MXGEFW_CMD_OK)
2530 		return;
2531 
2532 	/* Now we wait for the data to be cached */
2533 	cmd.data0 = byte;
2534 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2535 	for (ms = 0; err == EBUSY && ms < 50; ms++) {
2536 		DELAY(1000);
2537 		cmd.data0 = byte;
2538 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2539 	}
2540 	if (err != MXGEFW_CMD_OK) {
2541 		if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2542 		    cage_type, err, ms);
2543 		return;
2544 	}
2545 
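	/*
	 * The first table entry is an exact-match pattern; the
	 * remaining entries are single-bit tests.
	 */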
2546 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2547 		if (bootverbose) {
2548 			if_printf(sc->ifp, "%s:%s\n", cage_type,
2549 			    mxge_media_types[0].name);
2550 		}
2551 		if (sc->current_media != mxge_media_types[0].flag) {
2552 			mxge_media_init(sc);
2553 			mxge_media_set(sc, mxge_media_types[0].flag);
2554 		}
2555 		return;
2556 	}
2557 	for (i = 1; i < mxge_media_type_entries; i++) {
2558 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2559 			if (bootverbose) {
2560 				if_printf(sc->ifp, "%s:%s\n", cage_type,
2561 				    mxge_media_types[i].name);
2562 			}
2563 
2564 			if (sc->current_media != mxge_media_types[i].flag) {
2565 				mxge_media_init(sc);
2566 				mxge_media_set(sc, mxge_media_types[i].flag);
2567 			}
2568 			return;
2569 		}
2570 	}
2571 	if (bootverbose) {
2572 		if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2573 		    cmd.data0);
2574 	}
2575 }
2576 
2577 static void
2578 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2579 {
2580 	if (sc->link_state != stats->link_up) {
2581 		sc->link_state = stats->link_up;
2582 		if (sc->link_state) {
2583 			sc->ifp->if_link_state = LINK_STATE_UP;
2584 			if_link_state_change(sc->ifp);
2585 			if (bootverbose)
2586 				if_printf(sc->ifp, "link up\n");
2587 		} else {
2588 			sc->ifp->if_link_state = LINK_STATE_DOWN;
2589 			if_link_state_change(sc->ifp);
2590 			if (bootverbose)
2591 				if_printf(sc->ifp, "link down\n");
2592 		}
2593 		sc->need_media_probe = 1;
2594 	}
2595 
2596 	if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2597 		sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2598 		if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2599 		    sc->rdma_tags_available);
2600 	}
2601 
2602 	if (stats->link_down) {
2603 		sc->down_cnt += stats->link_down;
2604 		sc->link_state = 0;
2605 		sc->ifp->if_link_state = LINK_STATE_DOWN;
2606 		if_link_state_change(sc->ifp);
2607 	}
2608 }
2609 
2610 static void
2611 mxge_serialize_skipmain(struct mxge_softc *sc)
2612 {
2613 	lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2614 }
2615 
2616 static void
2617 mxge_deserialize_skipmain(struct mxge_softc *sc)
2618 {
2619 	lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2620 }
2621 
2622 static void
2623 mxge_legacy(void *arg)
2624 {
2625 	struct mxge_slice_state *ss = arg;
2626 	mxge_softc_t *sc = ss->sc;
2627 	mcp_irq_data_t *stats = ss->fw_stats;
2628 	mxge_tx_ring_t *tx = &ss->tx;
2629 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2630 	uint32_t send_done_count;
2631 	uint8_t valid;
2632 
2633 	ASSERT_SERIALIZED(&sc->main_serialize);
2634 
2635 #if 0
2636 	/* an interrupt on a non-zero slice is implicitly valid
2637 	   since MSI-X irqs are not shared */
2638 	if (ss != sc->ss) {
2639 		mxge_clean_rx_done(rx_done);
2640 		*ss->irq_claim = be32toh(3);
2641 		return;
2642 	}
2643 #endif
2644 
2645 	/* Make sure the DMA has finished */
2646 	if (!stats->valid)
2647 		return;
2648 	valid = stats->valid;
2649 
2650 	/* Lower legacy IRQ */
2651 	*sc->irq_deassert = 0;
2652 	if (!mxge_deassert_wait) {
		/* Don't wait for confirmation that the irq is low */
2654 		stats->valid = 0;
2655 	}
2656 
2657 	mxge_serialize_skipmain(sc);
2658 
2659 	/*
2660 	 * Loop while waiting for legacy irq deassertion
2661 	 * XXX do we really want to loop?
2662 	 */
2663 	do {
2664 		/* Check for transmit completes and receives */
2665 		send_done_count = be32toh(stats->send_done_count);
2666 		while ((send_done_count != tx->pkt_done) ||
2667 		       (rx_done->entry[rx_done->idx].length != 0)) {
2668 			if (send_done_count != tx->pkt_done) {
2669 				mxge_tx_done(&sc->arpcom.ac_if, tx,
2670 				    (int)send_done_count);
2671 			}
2672 			mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data);
2673 			send_done_count = be32toh(stats->send_done_count);
2674 		}
2675 		if (mxge_deassert_wait)
2676 			wmb();
2677 	} while (*((volatile uint8_t *)&stats->valid));
2678 
2679 	mxge_deserialize_skipmain(sc);
2680 
2681 	/* Fw link & error stats meaningful only on the first slice */
2682 	if (__predict_false(stats->stats_updated))
2683 		mxge_intr_status(sc, stats);
2684 
2685 	/* Check to see if we have rx token to pass back */
2686 	if (valid & 0x1)
		*ss->irq_claim = be32toh(3);
2688 	*(ss->irq_claim + 1) = be32toh(3);
2689 }
2690 
2691 static void
2692 mxge_msi(void *arg)
2693 {
2694 	struct mxge_slice_state *ss = arg;
2695 	mxge_softc_t *sc = ss->sc;
2696 	mcp_irq_data_t *stats = ss->fw_stats;
2697 	mxge_tx_ring_t *tx = &ss->tx;
2698 	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2699 	uint32_t send_done_count;
2700 	uint8_t valid;
2701 
2702 	ASSERT_SERIALIZED(&sc->main_serialize);
2703 
2704 	/* Make sure the DMA has finished */
2705 	if (__predict_false(!stats->valid))
2706 		return;
2707 
2708 	valid = stats->valid;
2709 	stats->valid = 0;
2710 
2711 	/* Check for receives */
2712 	lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2713 	if (rx_done->entry[rx_done->idx].length != 0)
2714 		mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data);
2715 	lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2716 
2717 	/*
2718 	 * Check for transmit completes
2719 	 *
2720 	 * NOTE:
2721 	 * Since pkt_done is only changed by mxge_tx_done(),
2722 	 * which is called only in interrupt handler, the
2723 	 * check w/o holding tx serializer is MPSAFE.
2724 	 */
2725 	send_done_count = be32toh(stats->send_done_count);
2726 	if (send_done_count != tx->pkt_done) {
2727 		lwkt_serialize_enter(&tx->tx_serialize);
2728 		mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2729 		lwkt_serialize_exit(&tx->tx_serialize);
2730 	}
2731 
2732 	if (__predict_false(stats->stats_updated))
2733 		mxge_intr_status(sc, stats);
2734 
2735 	/* Check to see if we have rx token to pass back */
2736 	if (valid & 0x1)
		*ss->irq_claim = be32toh(3);
2738 	*(ss->irq_claim + 1) = be32toh(3);
2739 }
2740 
2741 static void
2742 mxge_init(void *arg)
2743 {
2744 	struct mxge_softc *sc = arg;
2745 
2746 	ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2747 	if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2748 		mxge_open(sc);
2749 }
2750 
2751 static void
2752 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2753 {
2754 	int i;
2755 
2756 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2757 		if (ss->rx_data.rx_big.info[i].m == NULL)
2758 			continue;
2759 		bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2760 		    ss->rx_data.rx_big.info[i].map);
2761 		m_freem(ss->rx_data.rx_big.info[i].m);
2762 		ss->rx_data.rx_big.info[i].m = NULL;
2763 	}
2764 
2765 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2766 		if (ss->rx_data.rx_small.info[i].m == NULL)
2767 			continue;
2768 		bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2769 		    ss->rx_data.rx_small.info[i].map);
2770 		m_freem(ss->rx_data.rx_small.info[i].m);
2771 		ss->rx_data.rx_small.info[i].m = NULL;
2772 	}
2773 
2774 	/* Transmit ring used only on the first slice */
2775 	if (ss->tx.info == NULL)
2776 		return;
2777 
2778 	for (i = 0; i <= ss->tx.mask; i++) {
2779 		if (ss->tx.info[i].m == NULL)
2780 			continue;
2781 		bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2782 		m_freem(ss->tx.info[i].m);
2783 		ss->tx.info[i].m = NULL;
2784 	}
2785 }
2786 
2787 static void
2788 mxge_free_mbufs(mxge_softc_t *sc)
2789 {
2790 	int slice;
2791 
2792 	for (slice = 0; slice < sc->num_slices; slice++)
2793 		mxge_free_slice_mbufs(&sc->ss[slice]);
2794 }
2795 
2796 static void
2797 mxge_free_slice_rings(struct mxge_slice_state *ss)
2798 {
2799 	int i;
2800 
2801 	if (ss->rx_data.rx_done.entry != NULL) {
2802 		mxge_dma_free(&ss->rx_done_dma);
2803 		ss->rx_data.rx_done.entry = NULL;
2804 	}
2805 
2806 	if (ss->tx.req_list != NULL) {
2807 		kfree(ss->tx.req_list, M_DEVBUF);
2808 		ss->tx.req_list = NULL;
2809 	}
2810 
2811 	if (ss->tx.seg_list != NULL) {
2812 		kfree(ss->tx.seg_list, M_DEVBUF);
2813 		ss->tx.seg_list = NULL;
2814 	}
2815 
2816 	if (ss->rx_data.rx_small.shadow != NULL) {
2817 		kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2818 		ss->rx_data.rx_small.shadow = NULL;
2819 	}
2820 
2821 	if (ss->rx_data.rx_big.shadow != NULL) {
2822 		kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2823 		ss->rx_data.rx_big.shadow = NULL;
2824 	}
2825 
2826 	if (ss->tx.info != NULL) {
2827 		if (ss->tx.dmat != NULL) {
2828 			for (i = 0; i <= ss->tx.mask; i++) {
2829 				bus_dmamap_destroy(ss->tx.dmat,
2830 				    ss->tx.info[i].map);
2831 			}
2832 			bus_dma_tag_destroy(ss->tx.dmat);
2833 		}
2834 		kfree(ss->tx.info, M_DEVBUF);
2835 		ss->tx.info = NULL;
2836 	}
2837 
2838 	if (ss->rx_data.rx_small.info != NULL) {
2839 		if (ss->rx_data.rx_small.dmat != NULL) {
2840 			for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2841 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2842 				    ss->rx_data.rx_small.info[i].map);
2843 			}
2844 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2845 			    ss->rx_data.rx_small.extra_map);
2846 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2847 		}
2848 		kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2849 		ss->rx_data.rx_small.info = NULL;
2850 	}
2851 
2852 	if (ss->rx_data.rx_big.info != NULL) {
2853 		if (ss->rx_data.rx_big.dmat != NULL) {
2854 			for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2855 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2856 				    ss->rx_data.rx_big.info[i].map);
2857 			}
2858 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2859 			    ss->rx_data.rx_big.extra_map);
2860 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2861 		}
2862 		kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2863 		ss->rx_data.rx_big.info = NULL;
2864 	}
2865 }
2866 
2867 static void
2868 mxge_free_rings(mxge_softc_t *sc)
2869 {
2870 	int slice;
2871 
2872 	if (sc->ss == NULL)
2873 		return;
2874 
2875 	for (slice = 0; slice < sc->num_slices; slice++)
2876 		mxge_free_slice_rings(&sc->ss[slice]);
2877 }
2878 
2879 static int
2880 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2881     int tx_ring_entries)
2882 {
2883 	mxge_softc_t *sc = ss->sc;
2884 	size_t bytes;
2885 	int err, i;
2886 
2887 	/*
2888 	 * Allocate per-slice receive resources
2889 	 */
2890 
2891 	ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
2892 	    rx_ring_entries - 1;
2893 	ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
2894 
2895 	/* Allocate the rx shadow rings */
2896 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
2897 	ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2898 
2899 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
2900 	ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2901 
2902 	/* Allocate the rx host info rings */
2903 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
2904 	ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2905 
2906 	bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
2907 	ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2908 
2909 	/* Allocate the rx busdma resources */
2910 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2911 				 1,			/* alignment */
2912 				 4096,			/* boundary */
2913 				 BUS_SPACE_MAXADDR,	/* low */
2914 				 BUS_SPACE_MAXADDR,	/* high */
2915 				 NULL, NULL,		/* filter */
2916 				 MHLEN,			/* maxsize */
2917 				 1,			/* num segs */
2918 				 MHLEN,			/* maxsegsize */
2919 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2920 				 			/* flags */
2921 				 &ss->rx_data.rx_small.dmat); /* tag */
2922 	if (err != 0) {
2923 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2924 		    err);
2925 		return err;
2926 	}
2927 
2928 	err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
2929 	    &ss->rx_data.rx_small.extra_map);
2930 	if (err != 0) {
2931 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
2932 		bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2933 		ss->rx_data.rx_small.dmat = NULL;
2934 		return err;
2935 	}
2936 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2937 		err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
2938 		    BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
2939 		if (err != 0) {
2940 			int j;
2941 
2942 			device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
2943 
2944 			for (j = 0; j < i; ++j) {
2945 				bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2946 				    ss->rx_data.rx_small.info[j].map);
2947 			}
2948 			bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2949 			    ss->rx_data.rx_small.extra_map);
2950 			bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2951 			ss->rx_data.rx_small.dmat = NULL;
2952 			return err;
2953 		}
2954 	}
2955 
2956 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2957 				 1,			/* alignment */
2958 				 4096,			/* boundary */
2959 				 BUS_SPACE_MAXADDR,	/* low */
2960 				 BUS_SPACE_MAXADDR,	/* high */
2961 				 NULL, NULL,		/* filter */
2962 				 4096,			/* maxsize */
2963 				 1,			/* num segs */
2964 				 4096,			/* maxsegsize*/
2965 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
2966 				 			/* flags */
2967 				 &ss->rx_data.rx_big.dmat); /* tag */
2968 	if (err != 0) {
2969 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2970 		    err);
2971 		return err;
2972 	}
2973 
2974 	err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
2975 	    &ss->rx_data.rx_big.extra_map);
2976 	if (err != 0) {
2977 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
2978 		bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2979 		ss->rx_data.rx_big.dmat = NULL;
2980 		return err;
2981 	}
2982 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2983 		err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
2984 		    &ss->rx_data.rx_big.info[i].map);
2985 		if (err != 0) {
2986 			int j;
2987 
2988 			device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
2989 			for (j = 0; j < i; ++j) {
2990 				bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2991 				    ss->rx_data.rx_big.info[j].map);
2992 			}
2993 			bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2994 			    ss->rx_data.rx_big.extra_map);
2995 			bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2996 			ss->rx_data.rx_big.dmat = NULL;
2997 			return err;
2998 		}
2999 	}
3000 
3001 	/*
3002 	 * Now allocate TX resources
3003 	 */
3004 
3005 	ss->tx.mask = tx_ring_entries - 1;
3006 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3007 
3008 	/*
3009 	 * Allocate the tx request copy block; MUST be at least 8 bytes
3010 	 * aligned
3011 	 */
3012 	bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3013 	ss->tx.req_list = kmalloc_cachealign(__VM_CACHELINE_ALIGN(bytes),
3014 	    M_DEVBUF, M_WAITOK);
3015 
3016 	/* Allocate the tx busdma segment list */
3017 	bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3018 	ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3019 
3020 	/* Allocate the tx host info ring */
3021 	bytes = tx_ring_entries * sizeof(*ss->tx.info);
3022 	ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3023 
3024 	/* Allocate the tx busdma resources */
3025 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3026 				 1,			/* alignment */
3027 				 sc->tx_boundary,	/* boundary */
3028 				 BUS_SPACE_MAXADDR,	/* low */
3029 				 BUS_SPACE_MAXADDR,	/* high */
3030 				 NULL, NULL,		/* filter */
3031 				 IP_MAXPACKET +
3032 				 sizeof(struct ether_vlan_header),
3033 				 			/* maxsize */
3034 				 ss->tx.max_desc - 2,	/* num segs */
3035 				 sc->tx_boundary,	/* maxsegsz */
3036 				 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3037 				 BUS_DMA_ONEBPAGE,	/* flags */
3038 				 &ss->tx.dmat);		/* tag */
3039 	if (err != 0) {
3040 		device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3041 		return err;
3042 	}
3043 
3044 	/*
3045 	 * Now use these tags to setup DMA maps for each slot in the ring
3046 	 */
3047 	for (i = 0; i <= ss->tx.mask; i++) {
3048 		err = bus_dmamap_create(ss->tx.dmat,
3049 		    BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3050 		if (err != 0) {
3051 			int j;
3052 
3053 			device_printf(sc->dev, "Err %d tx dmamap\n", err);
3054 			for (j = 0; j < i; ++j) {
3055 				bus_dmamap_destroy(ss->tx.dmat,
3056 				    ss->tx.info[j].map);
3057 			}
3058 			bus_dma_tag_destroy(ss->tx.dmat);
3059 			ss->tx.dmat = NULL;
3060 			return err;
3061 		}
3062 	}
3063 	return 0;
3064 }
3065 
3066 static int
3067 mxge_alloc_rings(mxge_softc_t *sc)
3068 {
3069 	mxge_cmd_t cmd;
3070 	int tx_ring_size;
3071 	int tx_ring_entries, rx_ring_entries;
3072 	int err, slice;
3073 
3074 	/* Get ring sizes */
3075 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3076 	if (err != 0) {
3077 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3078 		return err;
3079 	}
3080 	tx_ring_size = cmd.data0;
3081 
3082 	tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
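	/* The small and big rx rings each get half of the intr slots */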
3083 	rx_ring_entries = sc->rx_intr_slots / 2;
3084 	ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3085 	ifq_set_ready(&sc->ifp->if_snd);
3086 
3087 	for (slice = 0; slice < sc->num_slices; slice++) {
3088 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3089 		    rx_ring_entries, tx_ring_entries);
3090 		if (err != 0) {
3091 			device_printf(sc->dev,
3092 			    "alloc %d slice rings failed\n", slice);
3093 			return err;
3094 		}
3095 	}
3096 	return 0;
3097 }
3098 
3099 static void
3100 mxge_choose_params(int mtu, int *cl_size)
3101 {
3102 	int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
3103 
3104 	if (bufsize < MCLBYTES) {
3105 		*cl_size = MCLBYTES;
3106 	} else {
3107 		KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3108 		*cl_size = MJUMPAGESIZE;
3109 	}
3110 }
3111 
3112 static int
3113 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3114 {
3115 	mxge_cmd_t cmd;
3116 	int err, i, slice;
3117 
3118 	slice = ss - ss->sc->ss;
3119 
3120 	/*
3121 	 * Get the lanai pointers to the send and receive rings
3122 	 */
3123 	err = 0;
3124 #ifndef IFNET_BUF_RING
3125 	/* We currently only send from the first slice */
3126 	if (slice == 0) {
3127 #endif
3128 		cmd.data0 = slice;
3129 		err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3130 		ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3131 		    (ss->sc->sram + cmd.data0);
3132 		ss->tx.send_go = (volatile uint32_t *)
3133 		    (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3134 		ss->tx.send_stop = (volatile uint32_t *)
3135 		    (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3136 #ifndef IFNET_BUF_RING
3137 	}
3138 #endif
3139 
3140 	cmd.data0 = slice;
3141 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3142 	ss->rx_data.rx_small.lanai =
3143 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3144 
3145 	cmd.data0 = slice;
3146 	err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3147 	ss->rx_data.rx_big.lanai =
3148 	    (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3149 
3150 	if (err != 0) {
3151 		if_printf(ss->sc->ifp,
3152 		    "failed to get ring sizes or locations\n");
3153 		return EIO;
3154 	}
3155 
3156 	/*
3157 	 * Stock small receive ring
3158 	 */
3159 	for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3160 		err = mxge_get_buf_small(&ss->rx_data.rx_small,
3161 		    ss->rx_data.rx_small.info[i].map, i, TRUE);
3162 		if (err) {
3163 			if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3164 			    ss->rx_data.rx_small.mask + 1);
3165 			return ENOMEM;
3166 		}
3167 	}
3168 
3169 	/*
3170 	 * Stock big receive ring
3171 	 */
3172 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3173 		ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3174 		ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3175 	}
3176 
3177 	ss->rx_data.rx_big.cl_size = cl_size;
3178 
3179 	for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3180 		err = mxge_get_buf_big(&ss->rx_data.rx_big,
3181 		    ss->rx_data.rx_big.info[i].map, i, TRUE);
3182 		if (err) {
3183 			if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3184 			    ss->rx_data.rx_big.mask + 1);
3185 			return ENOMEM;
3186 		}
3187 	}
3188 	return 0;
3189 }
3190 
3191 static int
3192 mxge_open(mxge_softc_t *sc)
3193 {
3194 	struct ifnet *ifp = sc->ifp;
3195 	mxge_cmd_t cmd;
3196 	int err, slice, cl_size, i;
3197 	bus_addr_t bus;
3198 	volatile uint8_t *itable;
3199 	struct mxge_slice_state *ss;
3200 
3201 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3202 
3203 	/* Copy the MAC address in case it was overridden */
3204 	bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3205 
3206 	err = mxge_reset(sc, 1);
3207 	if (err != 0) {
3208 		if_printf(ifp, "failed to reset\n");
3209 		return EIO;
3210 	}
3211 
3212 	if (sc->num_slices > 1) {
3213 		/* Setup the indirection table */
3214 		cmd.data0 = sc->num_slices;
3215 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3216 
3217 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3218 		if (err != 0) {
3219 			if_printf(ifp, "failed to setup rss tables\n");
3220 			return err;
3221 		}
3222 
3223 		/* Just enable an identity mapping */
3224 		itable = sc->sram + cmd.data0;
3225 		for (i = 0; i < sc->num_slices; i++)
3226 			itable[i] = (uint8_t)i;
3227 
3228 		cmd.data0 = 1;
3229 		cmd.data1 = MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3230 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3231 		if (err != 0) {
3232 			if_printf(ifp, "failed to enable slices\n");
3233 			return err;
3234 		}
3235 	}
3236 
3237 	cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3238 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3239 	if (err) {
3240 		/*
3241 		 * Can't change TSO mode to NDIS, never allow TSO then
3242 		 */
3243 		if_printf(ifp, "failed to set TSO mode\n");
3244 		ifp->if_capenable &= ~IFCAP_TSO;
3245 		ifp->if_capabilities &= ~IFCAP_TSO;
3246 		ifp->if_hwassist &= ~CSUM_TSO;
3247 	}
3248 
3249 	mxge_choose_params(ifp->if_mtu, &cl_size);
3250 
3251 	cmd.data0 = 1;
3252 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3253 	/*
3254 	 * Error is only meaningful if we're trying to set
3255 	 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3256 	 */
3257 
3258 	/*
3259 	 * Give the firmware the mtu and the big and small buffer
3260 	 * sizes.  The firmware wants the big buf size to be a power
3261 	 * of two. Luckily, DragonFly's clusters are powers of two
3262 	 */
3263 	cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3264 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3265 
3266 	cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3267 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3268 
3269 	cmd.data0 = cl_size;
3270 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3271 
3272 	if (err != 0) {
3273 		if_printf(ifp, "failed to setup params\n");
3274 		goto abort;
3275 	}
3276 
3277 	/* Now give him the pointer to the stats block */
3278 	for (slice = 0; slice < sc->num_slices; slice++) {
3279 		ss = &sc->ss[slice];
3280 		cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3281 		cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3282 		cmd.data2 = sizeof(struct mcp_irq_data);
3283 		cmd.data2 |= (slice << 16);
3284 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3285 	}
3286 
3287 	if (err != 0) {
3288 		bus = sc->ss->fw_stats_dma.dmem_busaddr;
3289 		bus += offsetof(struct mcp_irq_data, send_done_count);
3290 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3291 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3292 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3293 		    &cmd);
3294 
3295 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3296 		sc->fw_multicast_support = 0;
3297 	} else {
3298 		sc->fw_multicast_support = 1;
3299 	}
3300 
3301 	if (err != 0) {
3302 		if_printf(ifp, "failed to setup params\n");
3303 		goto abort;
3304 	}
3305 
3306 	for (slice = 0; slice < sc->num_slices; slice++) {
3307 		err = mxge_slice_open(&sc->ss[slice], cl_size);
3308 		if (err != 0) {
3309 			if_printf(ifp, "couldn't open slice %d\n", slice);
3310 			goto abort;
3311 		}
3312 	}
3313 
3314 	/* Finally, start the firmware running */
3315 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3316 	if (err) {
3317 		if_printf(ifp, "Couldn't bring up link\n");
3318 		goto abort;
3319 	}
3320 	ifp->if_flags |= IFF_RUNNING;
3321 	ifq_clr_oactive(&ifp->if_snd);
3322 	ifp->if_timer = 0;
3323 
3324 	return 0;
3325 
3326 abort:
3327 	mxge_free_mbufs(sc);
3328 	return err;
3329 }
3330 
3331 static void
3332 mxge_close(mxge_softc_t *sc, int down)
3333 {
3334 	struct ifnet *ifp = sc->ifp;
3335 	mxge_cmd_t cmd;
3336 	int err, old_down_cnt;
3337 
3338 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3339 
3340 	ifp->if_flags &= ~IFF_RUNNING;
3341 	ifq_clr_oactive(&ifp->if_snd);
3342 	ifp->if_timer = 0;
3343 
3344 	if (!down) {
3345 		old_down_cnt = sc->down_cnt;
3346 		wmb();
3347 
3348 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3349 		if (err)
3350 			if_printf(ifp, "Couldn't bring down link\n");
3351 
3352 		if (old_down_cnt == sc->down_cnt) {
3353 			/* Wait for down irq */
3354 			ifnet_deserialize_all(ifp);
3355 			DELAY(10 * sc->intr_coal_delay);
3356 			ifnet_serialize_all(ifp);
3357 		}
3358 
3359 		wmb();
3360 		if (old_down_cnt == sc->down_cnt)
3361 			if_printf(ifp, "never got down irq\n");
3362 	}
3363 	mxge_free_mbufs(sc);
3364 }
3365 
3366 static void
3367 mxge_setup_cfg_space(mxge_softc_t *sc)
3368 {
3369 	device_t dev = sc->dev;
3370 	int reg;
3371 	uint16_t lnk, pectl;
3372 
3373 	/* Find the PCIe link width and set max read request to 4KB */
3374 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
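		/* Link status is at offset 0x12 of the PCIe capability */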
3375 		lnk = pci_read_config(dev, reg + 0x12, 2);
3376 		sc->link_width = (lnk >> 4) & 0x3f;
3377 
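		/*
		 * Device control is at offset 0x8; encoding 5 in
		 * bits 14:12 selects a 4KB max read request size.
		 */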
3378 		if (sc->pectl == 0) {
3379 			pectl = pci_read_config(dev, reg + 0x8, 2);
3380 			pectl = (pectl & ~0x7000) | (5 << 12);
3381 			pci_write_config(dev, reg + 0x8, pectl, 2);
3382 			sc->pectl = pectl;
3383 		} else {
3384 			/* Restore saved pectl after watchdog reset */
3385 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3386 		}
3387 	}
3388 
3389 	/* Enable DMA and memory space access */
3390 	pci_enable_busmaster(dev);
3391 }
3392 
3393 static uint32_t
3394 mxge_read_reboot(mxge_softc_t *sc)
3395 {
3396 	device_t dev = sc->dev;
3397 	uint32_t vs;
3398 
3399 	/* Find the vendor specific offset */
3400 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3401 		if_printf(sc->ifp, "could not find vendor specific offset\n");
3402 		return (uint32_t)-1;
3403 	}
3404 	/* Enable read32 mode */
3405 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3406 	/* Tell NIC which register to read */
3407 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3408 	return pci_read_config(dev, vs + 0x14, 4);
3409 }
3410 
3411 static void
3412 mxge_watchdog_reset(mxge_softc_t *sc)
3413 {
3414 	struct pci_devinfo *dinfo;
3415 	int err, running;
3416 	uint32_t reboot;
3417 	uint16_t cmd;
3418 
3419 	err = ENXIO;
3420 
3421 	if_printf(sc->ifp, "Watchdog reset!\n");
3422 
3423 	/*
3424 	 * Check to see if the NIC rebooted.  If it did, then all of
3425 	 * PCI config space has been reset, and things like the
3426 	 * busmaster bit will be zero.  If this is the case, then we
3427 	 * must restore PCI config space before the NIC can be used
3428 	 * again
3429 	 */
3430 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3431 	if (cmd == 0xffff) {
3432 		/*
3433 		 * Maybe the watchdog caught the NIC rebooting; wait
3434 		 * up to 100ms for it to finish.  If it does not come
3435 		 * back, then give up
3436 		 */
3437 		DELAY(1000*100);
3438 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3439 		if (cmd == 0xffff)
3440 			if_printf(sc->ifp, "NIC disappeared!\n");
3441 	}
3442 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3443 		/* Print the reboot status */
3444 		reboot = mxge_read_reboot(sc);
3445 		if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3446 
3447 		running = sc->ifp->if_flags & IFF_RUNNING;
3448 		if (running) {
3449 			/*
3450 			 * Quiesce NIC so that TX routines will not try to
3451 			 * xmit after restoration of BAR
3452 			 */
3453 
3454 			/* Mark the link as down */
3455 			if (sc->link_state) {
3456 				sc->ifp->if_link_state = LINK_STATE_DOWN;
3457 				if_link_state_change(sc->ifp);
3458 			}
3459 			mxge_close(sc, 1);
3460 		}
3461 		/* Restore PCI configuration space */
3462 		dinfo = device_get_ivars(sc->dev);
3463 		pci_cfg_restore(sc->dev, dinfo);
3464 
3465 		/* And redo any changes we made to our config space */
3466 		mxge_setup_cfg_space(sc);
3467 
3468 		/* Reload f/w */
3469 		err = mxge_load_firmware(sc, 0);
3470 		if (err)
3471 			if_printf(sc->ifp, "Unable to re-load f/w\n");
3472 		if (running && !err) {
3473 			err = mxge_open(sc);
3474 			if_devstart_sched(sc->ifp);
3475 		}
3476 		sc->watchdog_resets++;
3477 	} else {
3478 		if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3479 		err = 0;
3480 	}
3481 	if (err) {
3482 		if_printf(sc->ifp, "watchdog reset failed\n");
3483 	} else {
3484 		if (sc->dying == 2)
3485 			sc->dying = 0;
3486 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3487 	}
3488 }
3489 
3490 static void
3491 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3492 {
3493 	if_printf(sc->ifp, "slice %d struck? ring state:\n", slice);
3494 	if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3495 	    tx->req, tx->done, tx->queue_active);
3496 	if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3497 	    tx->activate, tx->deactivate);
3498 	if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3499 	    tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
3500 }
3501 
3502 static u_long
3503 mxge_update_stats(mxge_softc_t *sc)
3504 {
3505 	u_long ipackets, opackets, pkts;
3506 
3507 	IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3508 	IFNET_STAT_GET(sc->ifp, opackets, opackets);
3509 
3510 	pkts = ipackets - sc->ipackets;
3511 	pkts += opackets - sc->opackets;
3512 
3513 	sc->ipackets = ipackets;
3514 	sc->opackets = opackets;
3515 
3516 	return pkts;
3517 }
3518 
3519 static void
3520 mxge_tick(void *arg)
3521 {
3522 	mxge_softc_t *sc = arg;
3523 	u_long pkts = 0;
3524 	int err = 0;
3525 	int ticks;
3526 
3527 	lwkt_serialize_enter(&sc->main_serialize);
3528 
3529 	ticks = mxge_ticks;
3530 	if (sc->ifp->if_flags & IFF_RUNNING) {
3531 		/* Aggregate stats from different slices */
3532 		pkts = mxge_update_stats(sc);
3533 		if (sc->need_media_probe)
3534 			mxge_media_probe(sc);
3535 	}
3536 	if (pkts == 0) {
3537 		uint16_t cmd;
3538 
3539 		/* Ensure NIC did not suffer h/w fault while idle */
3540 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3541 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3542 			sc->dying = 2;
3543 			mxge_serialize_skipmain(sc);
3544 			mxge_watchdog_reset(sc);
3545 			mxge_deserialize_skipmain(sc);
3546 			err = ENXIO;
3547 		}
3548 
3549 		/* Look less often if NIC is idle */
3550 		ticks *= 4;
3551 	}
3552 
3553 	if (err == 0)
3554 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3555 
3556 	lwkt_serialize_exit(&sc->main_serialize);
3557 }
3558 
3559 static int
3560 mxge_media_change(struct ifnet *ifp)
3561 {
3562 	return EINVAL;
3563 }
3564 
3565 static int
3566 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3567 {
3568 	struct ifnet *ifp = sc->ifp;
3569 	int real_mtu, old_mtu;
3570 	int err = 0;
3571 
3572 	real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3573 	if (mtu > sc->max_mtu || real_mtu < 60)
3574 		return EINVAL;
3575 
3576 	old_mtu = ifp->if_mtu;
3577 	ifp->if_mtu = mtu;
3578 	if (ifp->if_flags & IFF_RUNNING) {
3579 		mxge_close(sc, 0);
3580 		err = mxge_open(sc);
3581 		if (err != 0) {
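			/* Reopen failed; restore the old MTU and recover */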
3582 			ifp->if_mtu = old_mtu;
3583 			mxge_close(sc, 0);
3584 			mxge_open(sc);
3585 		}
3586 	}
3587 	return err;
3588 }
3589 
3590 static void
3591 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3592 {
	mxge_softc_t *sc = ifp->if_softc;

3596 	if (sc == NULL)
3597 		return;
3598 	ifmr->ifm_status = IFM_AVALID;
3599 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
3600 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3601 	ifmr->ifm_active |= sc->current_media;
3602 }
3603 
3604 static int
3605 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3606     struct ucred *cr __unused)
3607 {
3608 	mxge_softc_t *sc = ifp->if_softc;
3609 	struct ifreq *ifr = (struct ifreq *)data;
3610 	int err, mask;
3611 
3612 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
3613 	err = 0;
3614 
3615 	switch (command) {
3616 	case SIOCSIFMTU:
3617 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3618 		break;
3619 
3620 	case SIOCSIFFLAGS:
3621 		if (sc->dying)
3622 			return EINVAL;
3623 
3624 		if (ifp->if_flags & IFF_UP) {
3625 			if (!(ifp->if_flags & IFF_RUNNING)) {
3626 				err = mxge_open(sc);
3627 			} else {
3628 				/*
3629 				 * Take care of PROMISC and ALLMULTI
3630 				 * flag changes
3631 				 */
3632 				mxge_change_promisc(sc,
3633 				    ifp->if_flags & IFF_PROMISC);
3634 				mxge_set_multicast_list(sc);
3635 			}
3636 		} else {
3637 			if (ifp->if_flags & IFF_RUNNING)
3638 				mxge_close(sc, 0);
3639 		}
3640 		break;
3641 
3642 	case SIOCADDMULTI:
3643 	case SIOCDELMULTI:
3644 		mxge_set_multicast_list(sc);
3645 		break;
3646 
3647 	case SIOCSIFCAP:
3648 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3649 		if (mask & IFCAP_TXCSUM) {
3650 			ifp->if_capenable ^= IFCAP_TXCSUM;
3651 			if (ifp->if_capenable & IFCAP_TXCSUM)
3652 				ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3653 			else
3654 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3655 		}
3656 		if (mask & IFCAP_TSO) {
3657 			ifp->if_capenable ^= IFCAP_TSO;
3658 			if (ifp->if_capenable & IFCAP_TSO)
3659 				ifp->if_hwassist |= CSUM_TSO;
3660 			else
3661 				ifp->if_hwassist &= ~CSUM_TSO;
3662 		}
3663 		if (mask & IFCAP_RXCSUM)
3664 			ifp->if_capenable ^= IFCAP_RXCSUM;
3665 		if (mask & IFCAP_VLAN_HWTAGGING)
3666 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3667 		break;
3668 
3669 	case SIOCGIFMEDIA:
3670 		mxge_media_probe(sc);
3671 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3672 		    &sc->media, command);
3673 		break;
3674 
3675 	default:
3676 		err = ether_ioctl(ifp, command, data);
3677 		break;
3678 	}
3679 	return err;
3680 }
3681 
3682 static void
3683 mxge_fetch_tunables(mxge_softc_t *sc)
3684 {
3685 	sc->intr_coal_delay = mxge_intr_coal_delay;
3686 	if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3687 		sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3688 
3689 	/* XXX */
3690 	if (mxge_ticks == 0)
3691 		mxge_ticks = hz / 2;
3692 
3693 	sc->pause = mxge_flow_control;
3694 
3695 	sc->throttle = mxge_throttle;
3696 	if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3697 		sc->throttle = MXGE_MAX_THROTTLE;
3698 	if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3699 		sc->throttle = MXGE_MIN_THROTTLE;
3700 }
3701 
3702 static void
3703 mxge_free_slices(mxge_softc_t *sc)
3704 {
3705 	struct mxge_slice_state *ss;
3706 	int i;
3707 
3708 	if (sc->ss == NULL)
3709 		return;
3710 
3711 	for (i = 0; i < sc->num_slices; i++) {
3712 		ss = &sc->ss[i];
3713 		if (ss->fw_stats != NULL) {
3714 			mxge_dma_free(&ss->fw_stats_dma);
3715 			ss->fw_stats = NULL;
3716 		}
3717 		if (ss->rx_data.rx_done.entry != NULL) {
3718 			mxge_dma_free(&ss->rx_done_dma);
3719 			ss->rx_data.rx_done.entry = NULL;
3720 		}
3721 	}
3722 	kfree(sc->ss, M_DEVBUF);
3723 	sc->ss = NULL;
3724 }
3725 
3726 static int
3727 mxge_alloc_slices(mxge_softc_t *sc)
3728 {
3729 	mxge_cmd_t cmd;
3730 	struct mxge_slice_state *ss;
3731 	size_t bytes;
3732 	int err, i, rx_ring_size;
3733 
3734 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3735 	if (err != 0) {
3736 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3737 		return err;
3738 	}
3739 	rx_ring_size = cmd.data0;
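	/*
	 * Leave room for two completion entries per receive descriptor;
	 * the same 2x factor is used in mxge_slice_probe(), presumably
	 * reflecting the firmware's worst case.
	 */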
3740 	sc->rx_intr_slots = 2 * (rx_ring_size / sizeof (mcp_dma_addr_t));
3741 
3742 	bytes = sizeof(*sc->ss) * sc->num_slices;
3743 	sc->ss = kmalloc_cachealign(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3744 
3745 	for (i = 0; i < sc->num_slices; i++) {
3746 		ss = &sc->ss[i];
3747 
3748 		ss->sc = sc;
3749 
3750 		lwkt_serialize_init(&ss->rx_data.rx_serialize);
3751 		lwkt_serialize_init(&ss->tx.tx_serialize);
3752 
3753 		/*
3754 		 * Allocate per-slice rx interrupt queues
3755 		 * XXX assumes a 4-byte mcp_slot
3756 		 */
3757 		bytes = sc->rx_intr_slots * sizeof(mcp_slot_t);
3758 		err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
3759 		if (err != 0) {
3760 			device_printf(sc->dev,
3761 			    "alloc slice %d rx_done failed\n", i);
3762 			return err;
3763 		}
3764 		ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
3765 
3766 		/*
3767 		 * Allocate the per-slice firmware stats
3768 		 */
3769 		bytes = sizeof(*ss->fw_stats);
3770 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma, bytes, 64);
3772 		if (err != 0) {
3773 			device_printf(sc->dev,
3774 			    "alloc slice %d fw_stats failed\n", i);
3775 			return err;
3776 		}
3777 		ss->fw_stats = ss->fw_stats_dma.dmem_addr;
3778 	}
3779 	return 0;
3780 }
3781 
3782 static void
3783 mxge_slice_probe(mxge_softc_t *sc)
3784 {
3785 	mxge_cmd_t cmd;
3786 	const char *old_fw;
3787 	int msix_cnt, status, max_intr_slots;
3788 
3789 	sc->num_slices = 1;
3790 
3791 	/*
3792 	 * XXX
3793 	 *
3794 	 * Don't enable multiple slices if they have been disabled by
3795 	 * the tunable, or if this is not an SMP system
3796 	 */
3797 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || ncpus < 2)
3798 		return;
3799 
3800 	/* see how many MSI-X interrupts are available */
3801 	msix_cnt = pci_msix_count(sc->dev);
3802 	if (msix_cnt < 2)
3803 		return;
3804 
3805 	/* Now load the slice-aware firmware and see what it supports */
3806 	old_fw = sc->fw_name;
3807 	if (old_fw == mxge_fw_aligned)
3808 		sc->fw_name = mxge_fw_rss_aligned;
3809 	else
3810 		sc->fw_name = mxge_fw_rss_unaligned;
3811 	status = mxge_load_firmware(sc, 0);
3812 	if (status != 0) {
3813 		device_printf(sc->dev, "Falling back to a single slice\n");
3814 		return;
3815 	}
3816 
3817 	/* Try to send a reset command to the card to see if it is alive */
3819 	memset(&cmd, 0, sizeof (cmd));
3820 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3821 	if (status != 0) {
3822 		device_printf(sc->dev, "failed reset\n");
3823 		goto abort_with_fw;
3824 	}
3825 
3826 	/* get rx ring size */
3827 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3828 	if (status != 0) {
3829 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3830 		goto abort_with_fw;
3831 	}
3832 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3833 
3834 	/* tell it the size of the interrupt queues */
3835 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3836 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3837 	if (status != 0) {
3838 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3839 		goto abort_with_fw;
3840 	}
3841 
3842 	/* Ask for the maximum number of slices it supports */
3843 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3844 	if (status != 0) {
3845 		device_printf(sc->dev,
3846 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3847 		goto abort_with_fw;
3848 	}
3849 	sc->num_slices = cmd.data0;
3850 	if (sc->num_slices > msix_cnt)
3851 		sc->num_slices = msix_cnt;
3852 
3853 	if (mxge_max_slices == -1) {
3854 		/* cap to number of CPUs in system */
3855 		if (sc->num_slices > ncpus)
3856 			sc->num_slices = ncpus;
3857 	} else {
3858 		if (sc->num_slices > mxge_max_slices)
3859 			sc->num_slices = mxge_max_slices;
3860 	}
3861 	/* make sure it is a power of two */
3862 	while (sc->num_slices & (sc->num_slices - 1))
3863 		sc->num_slices--;
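	/* e.g. 7 -> 6 -> 5 -> 4; n & (n - 1) == 0 only when n is a power of two */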
3864 
3865 	if (bootverbose)
3866 		device_printf(sc->dev, "using %d slices\n",
3867 			      sc->num_slices);
3868 
3869 	return;
3870 
3871 abort_with_fw:
3872 	sc->fw_name = old_fw;
3873 	(void) mxge_load_firmware(sc, 0);
3874 }
3875 
3876 #if 0
3877 static int
3878 mxge_add_msix_irqs(mxge_softc_t *sc)
3879 {
3880 	size_t bytes;
3881 	int count, err, i, rid;
3882 
3883 	rid = PCIR_BAR(2);
3884 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3885 						    &rid, RF_ACTIVE);
3886 
3887 	if (sc->msix_table_res == NULL) {
3888 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3889 		return ENXIO;
3890 	}
3891 
3892 	count = sc->num_slices;
3893 	err = pci_alloc_msix(sc->dev, &count);
3894 	if (err != 0) {
3895 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
3896 			      "err = %d\n", sc->num_slices, err);
3897 		goto abort_with_msix_table;
3898 	}
3899 	if (count < sc->num_slices) {
3900 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3901 			      sc->num_slices, count);
3902 		device_printf(sc->dev,
3903 			      "Try setting hw.mxge.max_slices to %d\n",
3904 			      count);
3905 		err = ENOSPC;
3906 		goto abort_with_msix;
3907 	}
3908 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3909 	sc->msix_irq_res = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3910 	if (sc->msix_irq_res == NULL) {
3911 		err = ENOMEM;
3912 		goto abort_with_msix;
3913 	}
3914 
3915 	for (i = 0; i < sc->num_slices; i++) {
3916 		rid = i + 1;
3917 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3918 							  SYS_RES_IRQ,
3919 							  &rid, RF_ACTIVE);
3920 		if (sc->msix_irq_res[i] == NULL) {
3921 			device_printf(sc->dev, "couldn't allocate IRQ res"
3922 				      " for message %d\n", i);
3923 			err = ENXIO;
3924 			goto abort_with_res;
3925 		}
3926 	}
3927 
3928 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3929 	sc->msix_ih = kmalloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}

3931 	for (i = 0; i < sc->num_slices; i++) {
3932 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3933 				     INTR_MPSAFE,
3934 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i],
3935 				     sc->ifp->if_serializer);
3936 		if (err != 0) {
3937 			device_printf(sc->dev, "couldn't setup intr for "
3938 				      "message %d\n", i);
3939 			goto abort_with_intr;
3940 		}
3941 	}
3942 
3943 	if (bootverbose) {
3944 		device_printf(sc->dev, "using %d msix IRQs:",
3945 			      sc->num_slices);
3946 		for (i = 0; i < sc->num_slices; i++)
3947 			kprintf(" %ld", rman_get_start(sc->msix_irq_res[i]));
3948 		kprintf("\n");
3949 	}
3950 	return (0);
3951 
3952 abort_with_intr:
3953 	for (i = 0; i < sc->num_slices; i++) {
3954 		if (sc->msix_ih[i] != NULL) {
3955 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3956 					  sc->msix_ih[i]);
3957 			sc->msix_ih[i] = NULL;
3958 		}
3959 	}
3960 	kfree(sc->msix_ih, M_DEVBUF);
3961 
3963 abort_with_res:
3964 	for (i = 0; i < sc->num_slices; i++) {
3965 		rid = i + 1;
3966 		if (sc->msix_irq_res[i] != NULL)
3967 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
3968 					     sc->msix_irq_res[i]);
3969 		sc->msix_irq_res[i] = NULL;
3970 	}
3971 	kfree(sc->msix_irq_res, M_DEVBUF);
3972 
3974 abort_with_msix:
3975 	pci_release_msi(sc->dev);
3976 
3977 abort_with_msix_table:
3978 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
3979 			     sc->msix_table_res);
3980 
3981 	return err;
3982 }
3983 #endif
3984 
3985 static int
3986 mxge_add_single_irq(mxge_softc_t *sc)
3987 {
3988 	driver_intr_t *intr_func;
3989 	u_int irq_flags;
3990 
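	/*
	 * Prefer MSI when mxge_msi_enable is set; pci_alloc_1intr()
	 * falls back to a legacy interrupt otherwise.
	 */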
3991 	sc->irq_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
3992 	    &sc->irq_rid, &irq_flags);
3993 
3994 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
3995 	    &sc->irq_rid, irq_flags);
3996 	if (sc->irq_res == NULL) {
3997 		device_printf(sc->dev, "could not alloc interrupt\n");
3998 		return ENXIO;
3999 	}
4000 
4001 	if (sc->irq_type == PCI_INTR_TYPE_LEGACY)
4002 		intr_func = mxge_legacy;
4003 	else
4004 		intr_func = mxge_msi;
4005 
4006 	return bus_setup_intr(sc->dev, sc->irq_res, INTR_MPSAFE,
4007 	    intr_func, &sc->ss[0], &sc->ih, &sc->main_serialize);
4008 }
4009 
4010 #if 0
4011 static void
4012 mxge_rem_msix_irqs(mxge_softc_t *sc)
4013 {
4014 	int i, rid;
4015 
4016 	for (i = 0; i < sc->num_slices; i++) {
4017 		if (sc->msix_ih[i] != NULL) {
4018 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4019 					  sc->msix_ih[i]);
4020 			sc->msix_ih[i] = NULL;
4021 		}
4022 	}
4023 	kfree(sc->msix_ih, M_DEVBUF);
4024 
4025 	for (i = 0; i < sc->num_slices; i++) {
4026 		rid = i + 1;
4027 		if (sc->msix_irq_res[i] != NULL)
4028 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4029 					     sc->msix_irq_res[i]);
4030 		sc->msix_irq_res[i] = NULL;
4031 	}
4032 	kfree(sc->msix_irq_res, M_DEVBUF);
4033 
4034 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4035 			     sc->msix_table_res);
4036 
4037 	pci_release_msi(sc->dev);
4039 }
4040 #endif
4041 
4042 static int
4043 mxge_add_irq(mxge_softc_t *sc)
4044 {
4045 #if 0
4046 	int err;
4047 
4048 	if (sc->num_slices > 1)
4049 		err = mxge_add_msix_irqs(sc);
4050 	else
4051 		err = mxge_add_single_irq(sc);
4052 
4053 	if (0 && err == 0 && sc->num_slices > 1) {
4054 		mxge_rem_msix_irqs(sc);
4055 		err = mxge_add_msix_irqs(sc);
4056 	}
4057 	return err;
4058 #else
4059 	return mxge_add_single_irq(sc);
4060 #endif
4061 }
4062 
4063 static void
4064 mxge_setup_serialize(struct mxge_softc *sc)
4065 {
4066 	int i = 0, slice;
4067 
4068 	/* Main + rx + tx */
4069 	sc->nserialize = (2 * sc->num_slices) + 1;
4070 	sc->serializes =
4071 	    kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4072 	        M_DEVBUF, M_WAITOK | M_ZERO);
4073 
4074 	/*
4075 	 * Setup serializes
4076 	 *
4077 	 * NOTE: Order is critical
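	 * (main serializer first, then all rx, then all tx; the
	 *  array is walked in this order when serializing)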
4078 	 */
4079 
4080 	KKASSERT(i < sc->nserialize);
4081 	sc->serializes[i++] = &sc->main_serialize;
4082 
4083 	for (slice = 0; slice < sc->num_slices; ++slice) {
4084 		KKASSERT(i < sc->nserialize);
4085 		sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4086 	}
4087 
4088 	for (slice = 0; slice < sc->num_slices; ++slice) {
4089 		KKASSERT(i < sc->nserialize);
4090 		sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4091 	}
4092 
4093 	KKASSERT(i == sc->nserialize);
4094 }
4095 
4096 static void
4097 mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
4098 {
4099 	struct mxge_softc *sc = ifp->if_softc;
4100 
4101 	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
4102 }
4103 
4104 static void
4105 mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4106 {
4107 	struct mxge_softc *sc = ifp->if_softc;
4108 
4109 	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
4110 }
4111 
4112 static int
4113 mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
4114 {
4115 	struct mxge_softc *sc = ifp->if_softc;
4116 
4117 	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
4118 }
4119 
4120 #ifdef INVARIANTS
4121 
4122 static void
4123 mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
4124     boolean_t serialized)
4125 {
4126 	struct mxge_softc *sc = ifp->if_softc;
4127 
4128 	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
4129 	    slz, serialized);
4130 }
4131 
4132 #endif	/* INVARIANTS */
4133 
4134 static int
4135 mxge_attach(device_t dev)
4136 {
4137 	mxge_softc_t *sc = device_get_softc(dev);
4138 	struct ifnet *ifp = &sc->arpcom.ac_if;
4139 	int err, rid;
4140 
4141 	/*
4142 	 * Avoid rewriting half the lines in this file to use
4143 	 * &sc->arpcom.ac_if instead
4144 	 */
4145 	sc->ifp = ifp;
4146 	sc->dev = dev;
4147 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4148 	ifmedia_init(&sc->media, 0, mxge_media_change, mxge_media_status);
4149 
4150 	lwkt_serialize_init(&sc->main_serialize);
4151 
4152 	mxge_fetch_tunables(sc);
4153 
4154 	err = bus_dma_tag_create(NULL,			/* parent */
4155 				 1,			/* alignment */
4156 				 0,			/* boundary */
4157 				 BUS_SPACE_MAXADDR,	/* low */
4158 				 BUS_SPACE_MAXADDR,	/* high */
4159 				 NULL, NULL,		/* filter */
4160 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
4161 				 0, 			/* num segs */
4162 				 BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
4163 				 0,			/* flags */
4164 				 &sc->parent_dmat);	/* tag */
4165 	if (err != 0) {
4166 		device_printf(dev, "Err %d allocating parent dmat\n", err);
4167 		goto failed;
4168 	}
4169 
4170 	callout_init_mp(&sc->co_hdl);
4171 
4172 	mxge_setup_cfg_space(sc);
4173 
4174 	/*
4175 	 * Map the board into the kernel
4176 	 */
4177 	rid = PCIR_BARS;
4178 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
4179 	    &rid, RF_ACTIVE);
4180 	if (sc->mem_res == NULL) {
4181 		device_printf(dev, "could not map memory\n");
4182 		err = ENXIO;
4183 		goto failed;
4184 	}
4185 
4186 	sc->sram = rman_get_virtual(sc->mem_res);
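	/*
	 * Usable SRAM: the 2MB part minus the regions the firmware
	 * reserves for itself (presumably the two 48KB scratch areas,
	 * a 32KB region and the 0x100 byte area at the top).
	 */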
4187 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4188 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4189 		device_printf(dev, "impossible memory region size %ld\n",
4190 		    rman_get_size(sc->mem_res));
4191 		err = ENXIO;
4192 		goto failed;
4193 	}
4194 
4195 	/*
4196 	 * Make a NUL-terminated copy of the EEPROM strings section of
4197 	 * LANai SRAM
4198 	 */
4199 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4200 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4201 	    rman_get_bushandle(sc->mem_res),
4202 	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4203 	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
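	/*
	 * Reading two bytes short of the bzero()ed buffer guarantees
	 * the string section always ends in NUL bytes.
	 */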
4204 	err = mxge_parse_strings(sc);
4205 	if (err != 0) {
4206 		device_printf(dev, "parse EEPROM string failed\n");
4207 		goto failed;
4208 	}
4209 
4210 	/*
4211 	 * Enable write combining for efficient use of PCIe bus
4212 	 */
4213 	mxge_enable_wc(sc);
4214 
4215 	/*
4216 	 * Allocate the out of band DMA memory
4217 	 */
4218 	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
4219 	if (err != 0) {
4220 		device_printf(dev, "alloc cmd DMA buf failed\n");
4221 		goto failed;
4222 	}
4223 	sc->cmd = sc->cmd_dma.dmem_addr;
4224 
4225 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4226 	if (err != 0) {
4227 		device_printf(dev, "alloc zeropad DMA buf failed\n");
4228 		goto failed;
4229 	}
4230 
4231 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4232 	if (err != 0) {
4233 		device_printf(dev, "alloc dmabench DMA buf failed\n");
4234 		goto failed;
4235 	}
4236 
4237 	/* Select & load the firmware */
4238 	err = mxge_select_firmware(sc);
4239 	if (err != 0) {
4240 		device_printf(dev, "select firmware failed\n");
4241 		goto failed;
4242 	}
4243 
4244 	mxge_slice_probe(sc);
4245 	err = mxge_alloc_slices(sc);
4246 	if (err != 0) {
4247 		device_printf(dev, "alloc slices failed\n");
4248 		goto failed;
4249 	}
4250 
4251 	/* Setup serializes */
4252 	mxge_setup_serialize(sc);
4253 
4254 	err = mxge_reset(sc, 0);
4255 	if (err != 0) {
4256 		device_printf(dev, "reset failed\n");
4257 		goto failed;
4258 	}
4259 
4260 	err = mxge_alloc_rings(sc);
4261 	if (err != 0) {
4262 		device_printf(dev, "failed to allocate rings\n");
4263 		goto failed;
4264 	}
4265 
4266 	ifp->if_baudrate = IF_Gbps(10UL);
4267 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
4268 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4269 
4270 	ifp->if_capabilities |= IFCAP_VLAN_MTU;
4271 #if 0
4272 	/* Well, it's software, sigh */
4273 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
4274 #endif
4275 	ifp->if_capenable = ifp->if_capabilities;
4276 
4277 	ifp->if_softc = sc;
4278 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4279 	ifp->if_init = mxge_init;
4280 	ifp->if_ioctl = mxge_ioctl;
4281 	ifp->if_start = mxge_start;
4282 	ifp->if_watchdog = mxge_watchdog;
4283 	ifp->if_serialize = mxge_serialize;
4284 	ifp->if_deserialize = mxge_deserialize;
4285 	ifp->if_tryserialize = mxge_tryserialize;
4286 #ifdef INVARIANTS
4287 	ifp->if_serialize_assert = mxge_serialize_assert;
4288 #endif
4289 
4290 	/* Increase TSO burst length */
4291 	ifp->if_tsolen = (32 * ETHERMTU);
4292 
4293 	/* Initialise the ifmedia structure */
4294 	mxge_media_init(sc);
4295 	mxge_media_probe(sc);
4296 
4297 	ether_ifattach(ifp, sc->mac_addr, NULL);
4298 
4299 	/*
4300 	 * XXX
4301 	 * We are not ready to do "gather" jumbo frames, so
4302 	 * limit the MTU to MJUMPAGESIZE
4303 	 */
4304 	sc->max_mtu = MJUMPAGESIZE -
4305 	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
4306 	sc->dying = 0;
4307 
4308 	/* must come after ether_ifattach() */
4309 	err = mxge_add_irq(sc);
4310 	if (err != 0) {
4311 		device_printf(dev, "alloc and setup intr failed\n");
4312 		ether_ifdetach(ifp);
4313 		goto failed;
4314 	}
4315 
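	/*
	 * Service the transmit queue and the periodic tick on the
	 * CPU that handles the interrupt.
	 */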
4316 	ifq_set_cpuid(&ifp->if_snd, rman_get_cpuid(sc->irq_res));
4317 	ifq_set_hw_serialize(&ifp->if_snd, &sc->ss[0].tx.tx_serialize);
4318 
4319 	mxge_add_sysctls(sc);
4320 
4321 	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
4322 	    rman_get_cpuid(sc->irq_res));
4323 	return 0;
4324 
4325 failed:
4326 	mxge_detach(dev);
4327 	return err;
4328 }
4329 
4330 static int
4331 mxge_detach(device_t dev)
4332 {
4333 	mxge_softc_t *sc = device_get_softc(dev);
4334 
4335 	if (device_is_attached(dev)) {
4336 		struct ifnet *ifp = sc->ifp;
4337 
4338 		ifnet_serialize_all(ifp);
4339 
4340 		sc->dying = 1;
4341 		if (ifp->if_flags & IFF_RUNNING)
4342 			mxge_close(sc, 1);
4343 		callout_stop(&sc->co_hdl);
4344 
4345 		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4346 
4347 		ifnet_deserialize_all(ifp);
4348 
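		/*
		 * callout_terminate() waits for a running mxge_tick() to
		 * finish, so call it only after the serializers have been
		 * released (the tick presumably takes them itself).
		 */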
4349 		callout_terminate(&sc->co_hdl);
4350 
4351 		ether_ifdetach(ifp);
4352 	}
4353 	ifmedia_removeall(&sc->media);
4354 
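	/* If attach got far enough, tell the firmware to stop its dummy RDMAs */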
4355 	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
4356 	    sc->sram != NULL)
4357 		mxge_dummy_rdma(sc, 0);
4358 
4359 	mxge_rem_sysctls(sc);
4360 	mxge_free_rings(sc);
4361 
4362 	/* MUST come after the sysctls and rings are freed */
4363 	mxge_free_slices(sc);
4364 
4365 	if (sc->dmabench_dma.dmem_addr != NULL)
4366 		mxge_dma_free(&sc->dmabench_dma);
4367 	if (sc->zeropad_dma.dmem_addr != NULL)
4368 		mxge_dma_free(&sc->zeropad_dma);
4369 	if (sc->cmd_dma.dmem_addr != NULL)
4370 		mxge_dma_free(&sc->cmd_dma);
4371 
4372 	if (sc->irq_res != NULL) {
4373 		bus_release_resource(dev, SYS_RES_IRQ, sc->irq_rid,
4374 		    sc->irq_res);
4375 	}
4376 	if (sc->irq_type == PCI_INTR_TYPE_MSI)
4377 		pci_release_msi(dev);
4378 
4379 	if (sc->mem_res != NULL) {
4380 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
4381 		    sc->mem_res);
4382 	}
4383 
4384 	if (sc->parent_dmat != NULL)
4385 		bus_dma_tag_destroy(sc->parent_dmat);
4386 
4387 	return 0;
4388 }
4389 
4390 static int
4391 mxge_shutdown(device_t dev)
4392 {
4393 	return 0;
4394 }
4395