/* ixgbe_netmap.h revision 270252 */
1/* 2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 */ 25 26/* 27 * $FreeBSD: stable/10/sys/dev/netmap/ixgbe_netmap.h 270252 2014-08-20 23:34:36Z luigi $ 28 * 29 * netmap support for: ixgbe 30 * 31 * This file is meant to be a reference on how to implement 32 * netmap support for a network driver. 33 * This file contains code but only static or inline functions used 34 * by a single driver. To avoid replication of code we just #include 35 * it near the beginning of the standard driver. 36 */ 37 38 39#include <net/netmap.h> 40#include <sys/selinfo.h> 41/* 42 * Some drivers may need the following headers. 
Others 43 * already include them by default 44 45#include <vm/vm.h> 46#include <vm/pmap.h> 47 48 */ 49#include <dev/netmap/netmap_kern.h> 50 51 52/* 53 * device-specific sysctl variables: 54 * 55 * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. 56 * During regular operations the CRC is stripped, but on some 57 * hardware reception of frames not multiple of 64 is slower, 58 * so using crcstrip=0 helps in benchmarks. 59 * 60 * ix_rx_miss, ix_rx_miss_bufs: 61 * count packets that might be missed due to lost interrupts. 62 */ 63SYSCTL_DECL(_dev_netmap); 64static int ix_rx_miss, ix_rx_miss_bufs, ix_crcstrip; 65SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip, 66 CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames"); 67SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, 68 CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr"); 69SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, 70 CTLFLAG_RW, &ix_rx_miss_bufs, 0, "potentially missed rx intr bufs"); 71 72 73static void 74set_crcstrip(struct ixgbe_hw *hw, int onoff) 75{ 76 /* crc stripping is set in two places: 77 * IXGBE_HLREG0 (modified on init_locked and hw reset) 78 * IXGBE_RDRXCTL (set by the original driver in 79 * ixgbe_setup_hw_rsc() called in init_locked. 80 * We disable the setting when netmap is compiled in). 81 * We update the values here, but also in ixgbe.c because 82 * init_locked sometimes is called outside our control. 83 */ 84 uint32_t hl, rxc; 85 86 hl = IXGBE_READ_REG(hw, IXGBE_HLREG0); 87 rxc = IXGBE_READ_REG(hw, IXGBE_RDRXCTL); 88 if (netmap_verbose) 89 D("%s read HLREG 0x%x rxc 0x%x", 90 onoff ? "enter" : "exit", hl, rxc); 91 /* hw requirements ... */ 92 rxc &= ~IXGBE_RDRXCTL_RSCFRSTSIZE; 93 rxc |= IXGBE_RDRXCTL_RSCACKC; 94 if (onoff && !ix_crcstrip) { 95 /* keep the crc. 
Fast rx */ 96 hl &= ~IXGBE_HLREG0_RXCRCSTRP; 97 rxc &= ~IXGBE_RDRXCTL_CRCSTRIP; 98 } else { 99 /* reset default mode */ 100 hl |= IXGBE_HLREG0_RXCRCSTRP; 101 rxc |= IXGBE_RDRXCTL_CRCSTRIP; 102 } 103 if (netmap_verbose) 104 D("%s write HLREG 0x%x rxc 0x%x", 105 onoff ? "enter" : "exit", hl, rxc); 106 IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hl); 107 IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc); 108} 109 110 111/* 112 * Register/unregister. We are already under netmap lock. 113 * Only called on the first register or the last unregister. 114 */ 115static int 116ixgbe_netmap_reg(struct netmap_adapter *na, int onoff) 117{ 118 struct ifnet *ifp = na->ifp; 119 struct adapter *adapter = ifp->if_softc; 120 121 IXGBE_CORE_LOCK(adapter); 122 ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ? 123 124 /* Tell the stack that the interface is no longer active */ 125 ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); 126 127 set_crcstrip(&adapter->hw, onoff); 128 /* enable or disable flags and callbacks in na and ifp */ 129 if (onoff) { 130 nm_set_native_flags(na); 131 } else { 132 nm_clear_native_flags(na); 133 } 134 ixgbe_init_locked(adapter); /* also enables intr */ 135 set_crcstrip(&adapter->hw, onoff); // XXX why twice ? 136 IXGBE_CORE_UNLOCK(adapter); 137 return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); 138} 139 140 141/* 142 * Reconcile kernel and user view of the transmit ring. 143 * 144 * All information is in the kring. 145 * Userspace wants to send packets up to the one before kring->rhead, 146 * kernel knows kring->nr_hwcur is the first unsent packet. 147 * 148 * Here we push packets out (as many as possible), and possibly 149 * reclaim buffers from previously completed transmission. 150 * 151 * The caller (netmap) guarantees that there is only one instance 152 * running at any time. Any interference with other driver 153 * methods should be handled by the individual drivers. 
154 */ 155static int 156ixgbe_netmap_txsync(struct netmap_kring *kring, int flags) 157{ 158 struct netmap_adapter *na = kring->na; 159 struct ifnet *ifp = na->ifp; 160 struct netmap_ring *ring = kring->ring; 161 u_int nm_i; /* index into the netmap ring */ 162 u_int nic_i; /* index into the NIC ring */ 163 u_int n; 164 u_int const lim = kring->nkr_num_slots - 1; 165 u_int const head = kring->rhead; 166 /* 167 * interrupts on every tx packet are expensive so request 168 * them every half ring, or where NS_REPORT is set 169 */ 170 u_int report_frequency = kring->nkr_num_slots >> 1; 171 172 /* device-specific */ 173 struct adapter *adapter = ifp->if_softc; 174 struct tx_ring *txr = &adapter->tx_rings[kring->ring_id]; 175 int reclaim_tx; 176 177 bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, 178 BUS_DMASYNC_POSTREAD); 179 180 /* 181 * First part: process new packets to send. 182 * nm_i is the current index in the netmap ring, 183 * nic_i is the corresponding index in the NIC ring. 184 * The two numbers differ because upon a *_init() we reset 185 * the NIC ring but leave the netmap ring unchanged. 186 * For the transmit ring, we have 187 * 188 * nm_i = kring->nr_hwcur 189 * nic_i = IXGBE_TDT (not tracked in the driver) 190 * and 191 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size 192 * 193 * In this driver kring->nkr_hwofs >= 0, but for other 194 * drivers it might be negative as well. 195 */ 196 197 /* 198 * If we have packets to send (kring->nr_hwcur != kring->rhead) 199 * iterate over the netmap ring, fetch length and update 200 * the corresponding slot in the NIC ring. Some drivers also 201 * need to update the buffer's physical address in the NIC slot 202 * even NS_BUF_CHANGED is not set (PNMB computes the addresses). 203 * 204 * The netmap_reload_map() calls is especially expensive, 205 * even when (as in this case) the tag is 0, so do only 206 * when the buffer has actually changed. 
207 * 208 * If possible do not set the report/intr bit on all slots, 209 * but only a few times per ring or when NS_REPORT is set. 210 * 211 * Finally, on 10G and faster drivers, it might be useful 212 * to prefetch the next slot and txr entry. 213 */ 214 215 nm_i = kring->nr_hwcur; 216 if (nm_i != head) { /* we have new packets to send */ 217 nic_i = netmap_idx_k2n(kring, nm_i); 218 219 __builtin_prefetch(&ring->slot[nm_i]); 220 __builtin_prefetch(&txr->tx_buffers[nic_i]); 221 222 for (n = 0; nm_i != head; n++) { 223 struct netmap_slot *slot = &ring->slot[nm_i]; 224 u_int len = slot->len; 225 uint64_t paddr; 226 void *addr = PNMB(na, slot, &paddr); 227 228 /* device-specific */ 229 union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i]; 230 struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[nic_i]; 231 int flags = (slot->flags & NS_REPORT || 232 nic_i == 0 || nic_i == report_frequency) ? 233 IXGBE_TXD_CMD_RS : 0; 234 235 /* prefetch for next round */ 236 __builtin_prefetch(&ring->slot[nm_i + 1]); 237 __builtin_prefetch(&txr->tx_buffers[nic_i + 1]); 238 239 NM_CHECK_ADDR_LEN(na, addr, len); 240 241 if (slot->flags & NS_BUF_CHANGED) { 242 /* buffer has changed, reload map */ 243 netmap_reload_map(na, txr->txtag, txbuf->map, addr); 244 } 245 slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); 246 247 /* Fill the slot in the NIC ring. */ 248 /* Use legacy descriptor, they are faster? 
*/ 249 curr->read.buffer_addr = htole64(paddr); 250 curr->read.olinfo_status = 0; 251 curr->read.cmd_type_len = htole32(len | flags | 252 IXGBE_ADVTXD_DCMD_IFCS | IXGBE_TXD_CMD_EOP); 253 254 /* make sure changes to the buffer are synced */ 255 bus_dmamap_sync(txr->txtag, txbuf->map, 256 BUS_DMASYNC_PREWRITE); 257 258 nm_i = nm_next(nm_i, lim); 259 nic_i = nm_next(nic_i, lim); 260 } 261 kring->nr_hwcur = head; 262 263 /* synchronize the NIC ring */ 264 bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, 265 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); 266 267 /* (re)start the tx unit up to slot nic_i (excluded) */ 268 IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), nic_i); 269 } 270 271 /* 272 * Second part: reclaim buffers for completed transmissions. 273 * Because this is expensive (we read a NIC register etc.) 274 * we only do it in specific cases (see below). 275 */ 276 if (flags & NAF_FORCE_RECLAIM) { 277 reclaim_tx = 1; /* forced reclaim */ 278 } else if (!nm_kr_txempty(kring)) { 279 reclaim_tx = 0; /* have buffers, no reclaim */ 280 } else { 281 /* 282 * No buffers available. Locate previous slot with 283 * REPORT_STATUS set. 284 * If the slot has DD set, we can reclaim space, 285 * otherwise wait for the next interrupt. 286 * This enables interrupt moderation on the tx 287 * side though it might reduce throughput. 288 */ 289 struct ixgbe_legacy_tx_desc *txd = 290 (struct ixgbe_legacy_tx_desc *)txr->tx_base; 291 292 nic_i = txr->next_to_clean + report_frequency; 293 if (nic_i > lim) 294 nic_i -= lim + 1; 295 // round to the closest with dd set 296 nic_i = (nic_i < kring->nkr_num_slots / 4 || 297 nic_i >= kring->nkr_num_slots*3/4) ? 298 0 : report_frequency; 299 reclaim_tx = txd[nic_i].upper.fields.status & IXGBE_TXD_STAT_DD; // XXX cpu_to_le32 ? 300 } 301 if (reclaim_tx) { 302 /* 303 * Record completed transmissions. 304 * We (re)use the driver's txr->next_to_clean to keep 305 * track of the most recently completed transmission. 
306 * 307 * The datasheet discourages the use of TDH to find 308 * out the number of sent packets, but we only set 309 * REPORT_STATUS in a few slots so TDH is the only 310 * good way. 311 */ 312 nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(kring->ring_id)); 313 if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ 314 D("TDH wrap %d", nic_i); 315 nic_i -= kring->nkr_num_slots; 316 } 317 if (nic_i != txr->next_to_clean) { 318 /* some tx completed, increment avail */ 319 txr->next_to_clean = nic_i; 320 kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); 321 } 322 } 323 324 nm_txsync_finalize(kring); 325 326 return 0; 327} 328 329 330/* 331 * Reconcile kernel and user view of the receive ring. 332 * Same as for the txsync, this routine must be efficient. 333 * The caller guarantees a single invocations, but races against 334 * the rest of the driver should be handled here. 335 * 336 * On call, kring->rhead is the first packet that userspace wants 337 * to keep, and kring->rcur is the wakeup point. 338 * The kernel has previously reported packets up to kring->rtail. 339 * 340 * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective 341 * of whether or not we received an interrupt. 
342 */ 343static int 344ixgbe_netmap_rxsync(struct netmap_kring *kring, int flags) 345{ 346 struct netmap_adapter *na = kring->na; 347 struct ifnet *ifp = na->ifp; 348 struct netmap_ring *ring = kring->ring; 349 u_int nm_i; /* index into the netmap ring */ 350 u_int nic_i; /* index into the NIC ring */ 351 u_int n; 352 u_int const lim = kring->nkr_num_slots - 1; 353 u_int const head = nm_rxsync_prologue(kring); 354 int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; 355 356 /* device-specific */ 357 struct adapter *adapter = ifp->if_softc; 358 struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id]; 359 360 if (head > lim) 361 return netmap_ring_reinit(kring); 362 363 /* XXX check sync modes */ 364 bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, 365 BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); 366 367 /* 368 * First part: import newly received packets. 369 * 370 * nm_i is the index of the next free slot in the netmap ring, 371 * nic_i is the index of the next received packet in the NIC ring, 372 * and they may differ in case if_init() has been called while 373 * in netmap mode. For the receive ring we have 374 * 375 * nic_i = rxr->next_to_check; 376 * nm_i = kring->nr_hwtail (previous) 377 * and 378 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size 379 * 380 * rxr->next_to_check is set to 0 on a ring reinit 381 */ 382 if (netmap_no_pendintr || force_update) { 383 int crclen = ix_crcstrip ? 
0 : 4; 384 uint16_t slot_flags = kring->nkr_slot_flags; 385 386 nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail) 387 nm_i = netmap_idx_n2k(kring, nic_i); 388 389 for (n = 0; ; n++) { 390 union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; 391 uint32_t staterr = le32toh(curr->wb.upper.status_error); 392 393 if ((staterr & IXGBE_RXD_STAT_DD) == 0) 394 break; 395 ring->slot[nm_i].len = le16toh(curr->wb.upper.length) - crclen; 396 ring->slot[nm_i].flags = slot_flags; 397 bus_dmamap_sync(rxr->ptag, 398 rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); 399 nm_i = nm_next(nm_i, lim); 400 nic_i = nm_next(nic_i, lim); 401 } 402 if (n) { /* update the state variables */ 403 if (netmap_no_pendintr && !force_update) { 404 /* diagnostics */ 405 ix_rx_miss ++; 406 ix_rx_miss_bufs += n; 407 } 408 rxr->next_to_check = nic_i; 409 kring->nr_hwtail = nm_i; 410 } 411 kring->nr_kflags &= ~NKR_PENDINTR; 412 } 413 414 /* 415 * Second part: skip past packets that userspace has released. 416 * (kring->nr_hwcur to kring->rhead excluded), 417 * and make the buffers available for reception. 
418 * As usual nm_i is the index in the netmap ring, 419 * nic_i is the index in the NIC ring, and 420 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size 421 */ 422 nm_i = kring->nr_hwcur; 423 if (nm_i != head) { 424 nic_i = netmap_idx_k2n(kring, nm_i); 425 for (n = 0; nm_i != head; n++) { 426 struct netmap_slot *slot = &ring->slot[nm_i]; 427 uint64_t paddr; 428 void *addr = PNMB(na, slot, &paddr); 429 430 union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; 431 struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; 432 433 if (addr == NETMAP_BUF_BASE(na)) /* bad buf */ 434 goto ring_reset; 435 436 if (slot->flags & NS_BUF_CHANGED) { 437 /* buffer has changed, reload map */ 438 netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr); 439 slot->flags &= ~NS_BUF_CHANGED; 440 } 441 curr->wb.upper.status_error = 0; 442 curr->read.pkt_addr = htole64(paddr); 443 bus_dmamap_sync(rxr->ptag, rxbuf->pmap, 444 BUS_DMASYNC_PREREAD); 445 nm_i = nm_next(nm_i, lim); 446 nic_i = nm_next(nic_i, lim); 447 } 448 kring->nr_hwcur = head; 449 450 bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, 451 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); 452 /* 453 * IMPORTANT: we must leave one free slot in the ring, 454 * so move nic_i back by one unit 455 */ 456 nic_i = nm_prev(nic_i, lim); 457 IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i); 458 } 459 460 /* tell userspace that there might be new packets */ 461 nm_rxsync_finalize(kring); 462 463 return 0; 464 465ring_reset: 466 return netmap_ring_reinit(kring); 467} 468 469 470/* 471 * The attach routine, called near the end of ixgbe_attach(), 472 * fills the parameters for netmap_attach() and calls it. 473 * It cannot fail, in the worst case (such as no memory) 474 * netmap mode will be disabled and the driver will only 475 * operate in standard mode. 
476 */ 477static void 478ixgbe_netmap_attach(struct adapter *adapter) 479{ 480 struct netmap_adapter na; 481 482 bzero(&na, sizeof(na)); 483 484 na.ifp = adapter->ifp; 485 na.na_flags = NAF_BDG_MAYSLEEP; 486 na.num_tx_desc = adapter->num_tx_desc; 487 na.num_rx_desc = adapter->num_rx_desc; 488 na.nm_txsync = ixgbe_netmap_txsync; 489 na.nm_rxsync = ixgbe_netmap_rxsync; 490 na.nm_register = ixgbe_netmap_reg; 491 na.num_tx_rings = na.num_rx_rings = adapter->num_queues; 492 netmap_attach(&na); 493} 494 495/* end of file */ 496