sfxge_rx.c revision 350409
/*-
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/dev/sfxge/sfxge_rx.c 350409 2019-07-29 10:41:21Z arybchik $");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#ifdef RSS
#include <net/rss_config.h>
#endif

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)

#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks;	/* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif /* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}

#ifdef RSS
static uint8_t toep_key[RSS_KEYSIZE];
#else
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif

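/*
 * Callout handler: post a software event to the queue's event queue to
 * request a refill after an earlier buffer allocation failure.
 */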
static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];
	magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

#define	SFXGE_REFILL_BATCH	64

static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
						   CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

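/*
 * Refill the queue up to its limit; called when a refill software event
 * is handled on the associated event queue.
 */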
void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
			efx_pseudo_hdr_hash_get(rxq->common,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}

#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}

static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq, rx_buf);
	return (1);
}

static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(rxq, rx_buf);
}

static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else /* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif /* SFXGE_LRO */

void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(rxq, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(rxq, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}

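/*
 * Stop a receive queue: flush it on the hardware (retrying on timeout),
 * complete any outstanding buffers and destroy the common code queue.
 */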
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
			       EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}

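/* Stop all receive queues and shut down the common code receive module. */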
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}

int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < nitems(sc->rx_indir_table); index++)
#ifdef RSS
		sc->rx_indir_table[index] =
			rss_get_indirection_to_bucket(index) % sc->rxq_count;
#else
		sc->rx_indir_table[index] = index % sc->rxq_count;
#endif
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       nitems(sc->rx_indir_table))) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
	    EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);

#ifdef RSS
	rss_getkey(toep_key);
#endif
	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

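/* Allocate and initialise the LRO hash table and connection lists for a queue. */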
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif /* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}

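/* Allocate and initialise the software state and DMA memory for one RX queue. */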
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}