/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $FreeBSD$
 *
 * netmap support for: ixgbe (both ix and ixv)
 *
 * This file is meant to be a reference on how to implement
 * netmap support for a network driver.
 * This file contains code but only static or inline functions used
 * by a single driver. To avoid replication of code we just #include
 * it near the beginning of the standard driver.
 */


#include <net/netmap.h>
#include <sys/selinfo.h>
/*
 * Some drivers may need the following headers. Others
 * already include them by default

#include <vm/vm.h>
#include <vm/pmap.h>

 */
#include <dev/netmap/netmap_kern.h>

void ixgbe_netmap_attach(struct adapter *adapter);

/*
 * device-specific sysctl variables:
 *
 * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
 *	During regular operations the CRC is stripped, but on some
 *	hardware reception of frames not multiple of 64 is slower,
 *	so using crcstrip=0 helps in benchmarks.
 *
 * ix_rx_miss, ix_rx_miss_bufs:
 *	count packets that might be missed due to lost interrupts.
 */
SYSCTL_DECL(_dev_netmap);
/* Diagnostic counters exported below; updated in ixgbe_netmap_rxsync(). */
static int ix_rx_miss, ix_rx_miss_bufs;
/* Non-static: also consulted by ixgbe.c (see comment in set_crcstrip()). */
int ix_crcstrip;
SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip,
    CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames");
SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss,
    CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr");
SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs,
    CTLFLAG_RW, &ix_rx_miss_bufs, 0, "potentially missed rx intr bufs");


/*
 * Program CRC-strip related bits in HLREG0 and RDRXCTL.
 * onoff != 0 means we are entering netmap mode: in that case, and if
 * the ix_crcstrip sysctl is 0, CRC stripping is disabled (faster rx on
 * some hardware, see sysctl comment above). Otherwise the default mode
 * (strip CRC) is restored.
 */
static void
set_crcstrip(struct ixgbe_hw *hw, int onoff)
{
	/* crc stripping is set in two places:
	 * IXGBE_HLREG0 (modified on init_locked and hw reset)
	 * IXGBE_RDRXCTL (set by the original driver in
	 *	ixgbe_setup_hw_rsc() called in init_locked.
	 *	We disable the setting when netmap is compiled in).
	 * We update the values here, but also in ixgbe.c because
	 * init_locked sometimes is called outside our control.
	 */
	uint32_t hl, rxc;

	hl = IXGBE_READ_REG(hw, IXGBE_HLREG0);
	rxc = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
	if (netmap_verbose)
		D("%s read HLREG 0x%x rxc 0x%x",
			onoff ? "enter" : "exit", hl, rxc);
	/* hw requirements ... */
	rxc &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
	rxc |= IXGBE_RDRXCTL_RSCACKC;
	if (onoff && !ix_crcstrip) {
		/* keep the crc. Fast rx */
		hl &= ~IXGBE_HLREG0_RXCRCSTRP;
		rxc &= ~IXGBE_RDRXCTL_CRCSTRIP;
	} else {
		/* reset default mode */
		hl |= IXGBE_HLREG0_RXCRCSTRP;
		rxc |= IXGBE_RDRXCTL_CRCSTRIP;
	}
	if (netmap_verbose)
		D("%s write HLREG 0x%x rxc 0x%x",
			onoff ? "enter" : "exit", hl, rxc);
	IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hl);
	IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc);
}


/*
 * Register/unregister. We are already under netmap lock.
 * Only called on the first register or the last unregister.
 */
static int
ixgbe_netmap_reg(struct netmap_adapter *na, int onoff)
{
	struct ifnet *ifp = na->ifp;
	struct adapter *adapter = ifp->if_softc;

	/*
	 * Quiesce the NIC before switching mode: take the core lock
	 * and mask interrupts, then reconfigure and re-init below.
	 */
	IXGBE_CORE_LOCK(adapter);
	ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ?

	/* VF devices (ixv) have no CRC-strip control registers. */
	if (!IXGBE_IS_VF(adapter))
		set_crcstrip(&adapter->hw, onoff);
	/* enable or disable flags and callbacks in na and ifp */
	if (onoff) {
		nm_set_native_flags(na);
	} else {
		nm_clear_native_flags(na);
	}
	ixgbe_init_locked(adapter);	/* also enables intr */
	/* init_locked may have reset the CRC-strip bits, so apply again. */
	if (!IXGBE_IS_VF(adapter))
		set_crcstrip(&adapter->hw, onoff); // XXX why twice ?
	IXGBE_CORE_UNLOCK(adapter);
	/* Return 0 on success, 1 if the interface failed to come up. */
	return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1);
}


/*
 * Reconcile kernel and user view of the transmit ring.
 *
 * All information is in the kring.
 * Userspace wants to send packets up to the one before kring->rhead,
 * kernel knows kring->nr_hwcur is the first unsent packet.
 *
 * Here we push packets out (as many as possible), and possibly
 * reclaim buffers from previously completed transmission.
 *
 * The caller (netmap) guarantees that there is only one instance
 * running at any time. Any interference with other driver
 * methods should be handled by the individual drivers.
 */
static int
ixgbe_netmap_txsync(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct ifnet *ifp = na->ifp;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i;	/* index into the netmap ring */
	u_int nic_i;	/* index into the NIC ring */
	u_int n;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	/*
	 * interrupts on every tx packet are expensive so request
	 * them every half ring, or where NS_REPORT is set
	 */
	u_int report_frequency = kring->nkr_num_slots >> 1;

	/* device-specific */
	struct adapter *adapter = ifp->if_softc;
	struct tx_ring *txr = &adapter->tx_rings[kring->ring_id];
	int reclaim_tx;

	/* make the descriptor ring visible to the CPU before reading it */
	bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
			BUS_DMASYNC_POSTREAD);

	/*
	 * First part: process new packets to send.
	 * nm_i is the current index in the netmap ring,
	 * nic_i is the corresponding index in the NIC ring.
	 * The two numbers differ because upon a *_init() we reset
	 * the NIC ring but leave the netmap ring unchanged.
	 * For the transmit ring, we have
	 *
	 *		nm_i = kring->nr_hwcur
	 *		nic_i = IXGBE_TDT (not tracked in the driver)
	 * and
	 * 		nm_i == (nic_i + kring->nkr_hwofs) % ring_size
	 *
	 * In this driver kring->nkr_hwofs >= 0, but for other
	 * drivers it might be negative as well.
	 */

	/*
	 * If we have packets to send (kring->nr_hwcur != kring->rhead)
	 * iterate over the netmap ring, fetch length and update
	 * the corresponding slot in the NIC ring. Some drivers also
	 * need to update the buffer's physical address in the NIC slot
	 * even NS_BUF_CHANGED is not set (PNMB computes the addresses).
	 *
	 * The netmap_reload_map() calls is especially expensive,
	 * even when (as in this case) the tag is 0, so do only
	 * when the buffer has actually changed.
	 *
	 * If possible do not set the report/intr bit on all slots,
	 * but only a few times per ring or when NS_REPORT is set.
	 *
	 * Finally, on 10G and faster drivers, it might be useful
	 * to prefetch the next slot and txr entry.
	 */

	nm_i = kring->nr_hwcur;
	if (nm_i != head) {	/* we have new packets to send */
		nic_i = netmap_idx_k2n(kring, nm_i);

		__builtin_prefetch(&ring->slot[nm_i]);
		__builtin_prefetch(&txr->tx_buffers[nic_i]);

		for (n = 0; nm_i != head; n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			u_int len = slot->len;
			uint64_t paddr;
			void *addr = PNMB(na, slot, &paddr);

			/* device-specific */
			union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i];
			struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[nic_i];
			/* request a tx completion report on NS_REPORT,
			 * at slot 0 and at the half-ring slot */
			int flags = (slot->flags & NS_REPORT ||
				nic_i == 0 || nic_i == report_frequency) ?
				IXGBE_TXD_CMD_RS : 0;

			/* prefetch for next round */
			__builtin_prefetch(&ring->slot[nm_i + 1]);
			__builtin_prefetch(&txr->tx_buffers[nic_i + 1]);

			NM_CHECK_ADDR_LEN(na, addr, len);

			if (slot->flags & NS_BUF_CHANGED) {
				/* buffer has changed, reload map */
				netmap_reload_map(na, txr->txtag, txbuf->map, addr);
			}
			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);

			/* Fill the slot in the NIC ring. */
			/* Use legacy descriptor, they are faster? */
			curr->read.buffer_addr = htole64(paddr);
			curr->read.olinfo_status = 0;
			curr->read.cmd_type_len = htole32(len | flags |
				IXGBE_ADVTXD_DCMD_IFCS | IXGBE_TXD_CMD_EOP);

			/* make sure changes to the buffer are synced */
			bus_dmamap_sync(txr->txtag, txbuf->map,
					BUS_DMASYNC_PREWRITE);

			nm_i = nm_next(nm_i, lim);
			nic_i = nm_next(nic_i, lim);
		}
		kring->nr_hwcur = head;

		/* synchronize the NIC ring */
		bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map,
			BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);

		/* (re)start the tx unit up to slot nic_i (excluded) */
		IXGBE_WRITE_REG(&adapter->hw, txr->tail, nic_i);
	}

	/*
	 * Second part: reclaim buffers for completed transmissions.
	 * Because this is expensive (we read a NIC register etc.)
	 * we only do it in specific cases (see below).
	 */
	if (flags & NAF_FORCE_RECLAIM) {
		reclaim_tx = 1; /* forced reclaim */
	} else if (!nm_kr_txempty(kring)) {
		reclaim_tx = 0; /* have buffers, no reclaim */
	} else {
		/*
		 * No buffers available. Locate previous slot with
		 * REPORT_STATUS set.
		 * If the slot has DD set, we can reclaim space,
		 * otherwise wait for the next interrupt.
		 * This enables interrupt moderation on the tx
		 * side though it might reduce throughput.
		 */
		struct ixgbe_legacy_tx_desc *txd =
		    (struct ixgbe_legacy_tx_desc *)txr->tx_base;

		nic_i = txr->next_to_clean + report_frequency;
		if (nic_i > lim)
			nic_i -= lim + 1;
		// round to the closest with dd set
		nic_i = (nic_i < kring->nkr_num_slots / 4 ||
			 nic_i >= kring->nkr_num_slots*3/4) ?
			0 : report_frequency;
		reclaim_tx = txd[nic_i].upper.fields.status & IXGBE_TXD_STAT_DD;	// XXX cpu_to_le32 ?
	}
	if (reclaim_tx) {
		/*
		 * Record completed transmissions.
		 * We (re)use the driver's txr->next_to_clean to keep
		 * track of the most recently completed transmission.
		 *
		 * The datasheet discourages the use of TDH to find
		 * out the number of sent packets, but we only set
		 * REPORT_STATUS in a few slots so TDH is the only
		 * good way.
		 */
		nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_IS_VF(adapter) ?
				       IXGBE_VFTDH(kring->ring_id) : IXGBE_TDH(kring->ring_id));
		if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */
			D("TDH wrap %d", nic_i);
			nic_i -= kring->nkr_num_slots;
		}
		if (nic_i != txr->next_to_clean) {
			/* some tx completed, increment avail */
			txr->next_to_clean = nic_i;
			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
		}
	}

	/* update ring->tail and friends from the kring state */
	nm_txsync_finalize(kring);

	return 0;
}


/*
 * Reconcile kernel and user view of the receive ring.
 * Same as for the txsync, this routine must be efficient.
 * The caller guarantees a single invocations, but races against
 * the rest of the driver should be handled here.
 *
 * On call, kring->rhead is the first packet that userspace wants
 * to keep, and kring->rcur is the wakeup point.
 * The kernel has previously reported packets up to kring->rtail.
 *
 * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
 * of whether or not we received an interrupt.
 */
static int
ixgbe_netmap_rxsync(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct ifnet *ifp = na->ifp;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i;	/* index into the netmap ring */
	u_int nic_i;	/* index into the NIC ring */
	u_int n;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = nm_rxsync_prologue(kring);
	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;

	/* device-specific */
	struct adapter *adapter = ifp->if_softc;
	struct rx_ring *rxr = &adapter->rx_rings[kring->ring_id];

	/* the prologue flags an out-of-range head; reinit the ring */
	if (head > lim)
		return netmap_ring_reinit(kring);

	/* XXX check sync modes */
	bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
			BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);

	/*
	 * First part: import newly received packets.
	 *
	 * nm_i is the index of the next free slot in the netmap ring,
	 * nic_i is the index of the next received packet in the NIC ring,
	 * and they may differ in case if_init() has been called while
	 * in netmap mode. For the receive ring we have
	 *
	 *	nic_i = rxr->next_to_check;
	 *	nm_i = kring->nr_hwtail (previous)
	 * and
	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
	 *
	 * rxr->next_to_check is set to 0 on a ring reinit
	 */
	if (netmap_no_pendintr || force_update) {
		/* if the hw does not strip the CRC (crclen == 4) we
		 * deduct it from the reported frame length; VF devices
		 * always strip it */
		int crclen = (ix_crcstrip || IXGBE_IS_VF(adapter) ) ? 0 : 4;
		uint16_t slot_flags = kring->nkr_slot_flags;

		nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail)
		nm_i = netmap_idx_n2k(kring, nic_i);

		/* scan descriptors until the first one without DD set */
		for (n = 0; ; n++) {
			union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i];
			uint32_t staterr = le32toh(curr->wb.upper.status_error);

			if ((staterr & IXGBE_RXD_STAT_DD) == 0)
				break;
			ring->slot[nm_i].len = le16toh(curr->wb.upper.length) - crclen;
			ring->slot[nm_i].flags = slot_flags;
			bus_dmamap_sync(rxr->ptag,
			    rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD);
			nm_i = nm_next(nm_i, lim);
			nic_i = nm_next(nic_i, lim);
		}
		if (n) { /* update the state variables */
			if (netmap_no_pendintr && !force_update) {
				/* diagnostics */
				ix_rx_miss ++;
				ix_rx_miss_bufs += n;
			}
			rxr->next_to_check = nic_i;
			kring->nr_hwtail = nm_i;
		}
		kring->nr_kflags &= ~NKR_PENDINTR;
	}

	/*
	 * Second part: skip past packets that userspace has released.
	 * (kring->nr_hwcur to kring->rhead excluded),
	 * and make the buffers available for reception.
	 * As usual nm_i is the index in the netmap ring,
	 * nic_i is the index in the NIC ring, and
	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
	 */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		nic_i = netmap_idx_k2n(kring, nm_i);
		for (n = 0; nm_i != head; n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			uint64_t paddr;
			void *addr = PNMB(na, slot, &paddr);

			union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i];
			struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i];

			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
				goto ring_reset;

			if (slot->flags & NS_BUF_CHANGED) {
				/* buffer has changed, reload map */
				netmap_reload_map(na, rxr->ptag, rxbuf->pmap, addr);
				slot->flags &= ~NS_BUF_CHANGED;
			}
			/* give the descriptor back to the hw */
			curr->wb.upper.status_error = 0;
			curr->read.pkt_addr = htole64(paddr);
			bus_dmamap_sync(rxr->ptag, rxbuf->pmap,
			    BUS_DMASYNC_PREREAD);
			nm_i = nm_next(nm_i, lim);
			nic_i = nm_next(nic_i, lim);
		}
		kring->nr_hwcur = head;

		bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map,
		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
		/*
		 * IMPORTANT: we must leave one free slot in the ring,
		 * so move nic_i back by one unit
		 */
		nic_i = nm_prev(nic_i, lim);
		IXGBE_WRITE_REG(&adapter->hw, rxr->tail, nic_i);
	}

	/* tell userspace that there might be new packets */
	nm_rxsync_finalize(kring);

	return 0;

ring_reset:
	return netmap_ring_reinit(kring);
}


/*
 * The attach routine, called near the end of ixgbe_attach(),
 * fills the parameters for netmap_attach() and calls it.
 * It cannot fail, in the worst case (such as no memory)
 * netmap mode will be disabled and the driver will only
 * operate in standard mode.
478 */ 479void 480ixgbe_netmap_attach(struct adapter *adapter) 481{ 482 struct netmap_adapter na; 483 484 bzero(&na, sizeof(na)); 485 486 na.ifp = adapter->ifp; 487 na.na_flags = NAF_BDG_MAYSLEEP; 488 na.num_tx_desc = adapter->num_tx_desc; 489 na.num_rx_desc = adapter->num_rx_desc; 490 na.nm_txsync = ixgbe_netmap_txsync; 491 na.nm_rxsync = ixgbe_netmap_rxsync; 492 na.nm_register = ixgbe_netmap_reg; 493 na.num_tx_rings = na.num_rx_rings = adapter->num_queues; 494 netmap_attach(&na); 495} 496 497/* end of file */ 498