1/*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2010-2016 Solarflare Communications Inc. 5 * All rights reserved. 6 * 7 * This software was developed in part by Philip Paeps under contract for 8 * Solarflare Communications, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright notice, 14 * this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright notice, 16 * this list of conditions and the following disclaimer in the documentation 17 * and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 23 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 24 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 25 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 26 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 27 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 28 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 29 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 * 31 * The views and conclusions contained in the software and documentation are 32 * those of the authors and should not be interpreted as representing official 33 * policies, either expressed or implied, of the FreeBSD Project. 34 */ 35 36#include <sys/cdefs.h> 37__FBSDID("$FreeBSD$"); 38 39#include "opt_rss.h" 40 41#include <sys/param.h> 42#include <sys/malloc.h> 43#include <sys/mbuf.h> 44#include <sys/smp.h> 45#include <sys/socket.h> 46#include <sys/sysctl.h> 47#include <sys/syslog.h> 48#include <sys/limits.h> 49#include <sys/syslog.h> 50 51#include <net/ethernet.h> 52#include <net/if.h> 53#include <net/if_vlan_var.h> 54 55#include <netinet/in.h> 56#include <netinet/ip.h> 57#include <netinet/ip6.h> 58#include <netinet/tcp.h> 59 60#include <machine/in_cksum.h> 61 62#ifdef RSS 63#include <net/rss_config.h> 64#endif 65 66#include "common/efx.h" 67 68 69#include "sfxge.h" 70#include "sfxge_rx.h" 71 72#define RX_REFILL_THRESHOLD(_entries) (EFX_RXQ_LIMIT(_entries) * 9 / 10) 73 74#ifdef SFXGE_LRO 75 76SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL, 77 "Large receive offload (LRO) parameters"); 78 79#define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param) 80 81/* Size of the LRO hash table. Must be a power of 2. A larger table 82 * means we can accelerate a larger number of streams. 83 */ 84static unsigned lro_table_size = 128; 85TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size); 86SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN, 87 &lro_table_size, 0, 88 "Size of the LRO hash table (must be a power of 2)"); 89 90/* Maximum length of a hash chain. If chains get too long then the lookup 91 * time increases and may exceed the benefit of LRO. 92 */ 93static unsigned lro_chain_max = 20; 94TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max); 95SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN, 96 &lro_chain_max, 0, 97 "The maximum length of a hash chain"); 98 99/* Maximum time (in ticks) that a connection can be idle before it's LRO 100 * state is discarded. 101 */ 102static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */ 103TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks); 104SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN, 105 &lro_idle_ticks, 0, 106 "The maximum time (in ticks) that a connection can be idle " 107 "before it's LRO state is discarded"); 108 109/* Number of packets with payload that must arrive in-order before a 110 * connection is eligible for LRO. The idea is we should avoid coalescing 111 * segments when the sender is in slow-start because reducing the ACK rate 112 * can damage performance. 113 */ 114static int lro_slow_start_packets = 2000; 115TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets); 116SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN, 117 &lro_slow_start_packets, 0, 118 "Number of packets with payload that must arrive in-order before " 119 "a connection is eligible for LRO"); 120 121/* Number of packets with payload that must arrive in-order following loss 122 * before a connection is eligible for LRO. The idea is we should avoid 123 * coalescing segments when the sender is recovering from loss, because 124 * reducing the ACK rate can damage performance. 125 */ 126static int lro_loss_packets = 20; 127TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets); 128SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN, 129 &lro_loss_packets, 0, 130 "Number of packets with payload that must arrive in-order " 131 "following loss before a connection is eligible for LRO"); 132 133/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */ 134#define SFXGE_LRO_L2_ID_VLAN 0x4000 135#define SFXGE_LRO_L2_ID_IPV6 0x8000 136#define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN) 137#define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6)) 138 139/* Compare IPv6 addresses, avoiding conditional branches */ 140static unsigned long ipv6_addr_cmp(const struct in6_addr *left, 141 const struct in6_addr *right) 142{ 143#if LONG_BIT == 64 144 const uint64_t *left64 = (const uint64_t *)left; 145 const uint64_t *right64 = (const uint64_t *)right; 146 return (left64[0] - right64[0]) | (left64[1] - right64[1]); 147#else 148 return (left->s6_addr32[0] - right->s6_addr32[0]) | 149 (left->s6_addr32[1] - right->s6_addr32[1]) | 150 (left->s6_addr32[2] - right->s6_addr32[2]) | 151 (left->s6_addr32[3] - right->s6_addr32[3]); 152#endif 153} 154 155#endif /* SFXGE_LRO */ 156 157void 158sfxge_rx_qflush_done(struct sfxge_rxq *rxq) 159{ 160 161 rxq->flush_state = SFXGE_FLUSH_DONE; 162} 163 164void 165sfxge_rx_qflush_failed(struct sfxge_rxq *rxq) 166{ 167 168 rxq->flush_state = SFXGE_FLUSH_FAILED; 169} 170 171#ifdef RSS 172static uint8_t toep_key[RSS_KEYSIZE]; 173#else 174static uint8_t toep_key[] = { 175 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 176 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 177 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 178 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 179 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa 180}; 181#endif 182 183static void 184sfxge_rx_post_refill(void *arg) 185{ 186 struct sfxge_rxq *rxq = arg; 187 struct sfxge_softc *sc; 188 unsigned int index; 189 struct sfxge_evq *evq; 190 uint16_t magic; 191 192 sc = rxq->sc; 193 index = rxq->index; 194 evq = sc->evq[index]; 195 magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq); 196 197 /* This is guaranteed due to the start/stop order of rx and ev */ 198 KASSERT(evq->init_state == SFXGE_EVQ_STARTED, 199 ("evq not started")); 200 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, 201 ("rxq not started")); 202 efx_ev_qpost(evq->common, magic); 203} 204 205static void 206sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying) 207{ 208 /* Initially retry after 100 ms, but back off in case of 209 * repeated failures as we probably have to wait for the 210 * administrator to raise the pool limit. */ 211 if (retrying) 212 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz); 213 else 214 rxq->refill_delay = hz / 10; 215 216 callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay, 217 sfxge_rx_post_refill, rxq); 218} 219 220#define SFXGE_REFILL_BATCH 64 221 222static void 223sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying) 224{ 225 struct sfxge_softc *sc; 226 unsigned int index; 227 struct sfxge_evq *evq; 228 unsigned int batch; 229 unsigned int rxfill; 230 unsigned int mblksize; 231 int ntodo; 232 efsys_dma_addr_t addr[SFXGE_REFILL_BATCH]; 233 234 sc = rxq->sc; 235 index = rxq->index; 236 evq = sc->evq[index]; 237 238 prefetch_read_many(sc->enp); 239 prefetch_read_many(rxq->common); 240 241 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq); 242 243 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 244 return; 245 246 rxfill = rxq->added - rxq->completed; 247 KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries), 248 ("rxfill > EFX_RXQ_LIMIT(rxq->entries)")); 249 ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target); 250 KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries), 251 ("ntodo > EFX_RQX_LIMIT(rxq->entries)")); 252 253 if (ntodo == 0) 254 return; 255 256 batch = 0; 257 mblksize = sc->rx_buffer_size - sc->rx_buffer_align; 258 while (ntodo-- > 0) { 259 unsigned int id; 260 struct sfxge_rx_sw_desc *rx_desc; 261 bus_dma_segment_t seg; 262 struct mbuf *m; 263 264 id = (rxq->added + batch) & rxq->ptr_mask; 265 rx_desc = &rxq->queue[id]; 266 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL")); 267 268 rx_desc->flags = EFX_DISCARD; 269 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, 270 sc->rx_cluster_size); 271 if (m == NULL) 272 break; 273 274 /* m_len specifies length of area to be mapped for DMA */ 275 m->m_len = mblksize; 276 m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data, 277 CACHE_LINE_SIZE); 278 m->m_data += sc->rx_buffer_align; 279 280 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg); 281 addr[batch++] = seg.ds_addr; 282 283 if (batch == SFXGE_REFILL_BATCH) { 284 efx_rx_qpost(rxq->common, addr, mblksize, batch, 285 rxq->completed, rxq->added); 286 rxq->added += batch; 287 batch = 0; 288 } 289 } 290 291 if (ntodo != 0) 292 sfxge_rx_schedule_refill(rxq, retrying); 293 294 if (batch != 0) { 295 efx_rx_qpost(rxq->common, addr, mblksize, batch, 296 rxq->completed, rxq->added); 297 rxq->added += batch; 298 } 299 300 /* Make the descriptors visible to the hardware */ 301 bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map, 302 BUS_DMASYNC_PREWRITE); 303 304 efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed); 305 306 /* The queue could still be empty if no descriptors were actually 307 * pushed, in which case there will be no event to cause the next 308 * refill, so we must schedule a refill ourselves. 309 */ 310 if(rxq->pushed == rxq->completed) { 311 sfxge_rx_schedule_refill(rxq, retrying); 312 } 313} 314 315void 316sfxge_rx_qrefill(struct sfxge_rxq *rxq) 317{ 318 319 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 320 return; 321 322 /* Make sure the queue is full */ 323 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE); 324} 325 326static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m) 327{ 328 struct ifnet *ifp = sc->ifnet; 329 330 m->m_pkthdr.rcvif = ifp; 331 m->m_pkthdr.csum_data = 0xffff; 332 ifp->if_input(ifp, m); 333} 334 335static void 336sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc) 337{ 338 struct sfxge_softc *sc = rxq->sc; 339 struct mbuf *m = rx_desc->mbuf; 340 int flags = rx_desc->flags; 341 int csum_flags; 342 343 /* Convert checksum flags */ 344 csum_flags = (flags & EFX_CKSUM_IPV4) ? 345 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0; 346 if (flags & EFX_CKSUM_TCPUDP) 347 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 348 349 if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) { 350 m->m_pkthdr.flowid = 351 efx_pseudo_hdr_hash_get(rxq->common, 352 EFX_RX_HASHALG_TOEPLITZ, 353 mtod(m, uint8_t *)); 354 /* The hash covers a 4-tuple for TCP only */ 355 M_HASHTYPE_SET(m, 356 (flags & EFX_PKT_IPV4) ? 357 ((flags & EFX_PKT_TCP) ? 358 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) : 359 ((flags & EFX_PKT_TCP) ? 360 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6)); 361 } 362 m->m_data += sc->rx_prefix_size; 363 m->m_len = rx_desc->size - sc->rx_prefix_size; 364 m->m_pkthdr.len = m->m_len; 365 m->m_pkthdr.csum_flags = csum_flags; 366 __sfxge_rx_deliver(sc, rx_desc->mbuf); 367 368 rx_desc->flags = EFX_DISCARD; 369 rx_desc->mbuf = NULL; 370} 371 372#ifdef SFXGE_LRO 373 374static void 375sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c) 376{ 377 struct sfxge_softc *sc = st->sc; 378 struct mbuf *m = c->mbuf; 379 struct tcphdr *c_th; 380 int csum_flags; 381 382 KASSERT(m, ("no mbuf to deliver")); 383 384 ++st->n_bursts; 385 386 /* Finish off packet munging and recalculate IP header checksum. */ 387 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 388 struct ip *iph = c->nh; 389 iph->ip_len = htons(iph->ip_len); 390 iph->ip_sum = 0; 391 iph->ip_sum = in_cksum_hdr(iph); 392 c_th = (struct tcphdr *)(iph + 1); 393 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR | 394 CSUM_IP_CHECKED | CSUM_IP_VALID); 395 } else { 396 struct ip6_hdr *iph = c->nh; 397 iph->ip6_plen = htons(iph->ip6_plen); 398 c_th = (struct tcphdr *)(iph + 1); 399 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; 400 } 401 402 c_th->th_win = c->th_last->th_win; 403 c_th->th_ack = c->th_last->th_ack; 404 if (c_th->th_off == c->th_last->th_off) { 405 /* Copy TCP options (take care to avoid going negative). */ 406 int optlen = ((c_th->th_off - 5) & 0xf) << 2u; 407 memcpy(c_th + 1, c->th_last + 1, optlen); 408 } 409 410 m->m_pkthdr.flowid = c->conn_hash; 411 M_HASHTYPE_SET(m, 412 SFXGE_LRO_CONN_IS_TCPIPV4(c) ? 413 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6); 414 415 m->m_pkthdr.csum_flags = csum_flags; 416 __sfxge_rx_deliver(sc, m); 417 418 c->mbuf = NULL; 419 c->delivered = 1; 420} 421 422/* Drop the given connection, and add it to the free list. */ 423static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) 424{ 425 unsigned bucket; 426 427 KASSERT(!c->mbuf, ("found orphaned mbuf")); 428 429 if (c->next_buf.mbuf != NULL) { 430 sfxge_rx_deliver(rxq, &c->next_buf); 431 LIST_REMOVE(c, active_link); 432 } 433 434 bucket = c->conn_hash & rxq->lro.conns_mask; 435 KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong")); 436 --rxq->lro.conns_n[bucket]; 437 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); 438 TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link); 439} 440 441/* Stop tracking connections that have gone idle in order to keep hash 442 * chains short. 443 */ 444static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now) 445{ 446 struct sfxge_lro_conn *c; 447 unsigned i; 448 449 KASSERT(LIST_EMPTY(&rxq->lro.active_conns), 450 ("found active connections")); 451 452 rxq->lro.last_purge_ticks = now; 453 for (i = 0; i <= rxq->lro.conns_mask; ++i) { 454 if (TAILQ_EMPTY(&rxq->lro.conns[i])) 455 continue; 456 457 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq); 458 if (now - c->last_pkt_ticks > lro_idle_ticks) { 459 ++rxq->lro.n_drop_idle; 460 sfxge_lro_drop(rxq, c); 461 } 462 } 463} 464 465static void 466sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, 467 struct mbuf *mbuf, struct tcphdr *th) 468{ 469 struct tcphdr *c_th; 470 471 /* Tack the new mbuf onto the chain. */ 472 KASSERT(!mbuf->m_next, ("mbuf already chained")); 473 c->mbuf_tail->m_next = mbuf; 474 c->mbuf_tail = mbuf; 475 476 /* Increase length appropriately */ 477 c->mbuf->m_pkthdr.len += mbuf->m_len; 478 479 /* Update the connection state flags */ 480 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 481 struct ip *iph = c->nh; 482 iph->ip_len += mbuf->m_len; 483 c_th = (struct tcphdr *)(iph + 1); 484 } else { 485 struct ip6_hdr *iph = c->nh; 486 iph->ip6_plen += mbuf->m_len; 487 c_th = (struct tcphdr *)(iph + 1); 488 } 489 c_th->th_flags |= (th->th_flags & TH_PUSH); 490 c->th_last = th; 491 ++st->n_merges; 492 493 /* Pass packet up now if another segment could overflow the IP 494 * length. 495 */ 496 if (c->mbuf->m_pkthdr.len > 65536 - 9200) 497 sfxge_lro_deliver(st, c); 498} 499 500static void 501sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c, 502 struct mbuf *mbuf, void *nh, struct tcphdr *th) 503{ 504 /* Start the chain */ 505 c->mbuf = mbuf; 506 c->mbuf_tail = c->mbuf; 507 c->nh = nh; 508 c->th_last = th; 509 510 mbuf->m_pkthdr.len = mbuf->m_len; 511 512 /* Mangle header fields for later processing */ 513 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 514 struct ip *iph = nh; 515 iph->ip_len = ntohs(iph->ip_len); 516 } else { 517 struct ip6_hdr *iph = nh; 518 iph->ip6_plen = ntohs(iph->ip6_plen); 519 } 520} 521 522/* Try to merge or otherwise hold or deliver (as appropriate) the 523 * packet buffered for this connection (c->next_buf). Return a flag 524 * indicating whether the connection is still active for LRO purposes. 525 */ 526static int 527sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c) 528{ 529 struct sfxge_rx_sw_desc *rx_buf = &c->next_buf; 530 char *eh = c->next_eh; 531 int data_length, hdr_length, dont_merge; 532 unsigned th_seq, pkt_length; 533 struct tcphdr *th; 534 unsigned now; 535 536 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 537 struct ip *iph = c->next_nh; 538 th = (struct tcphdr *)(iph + 1); 539 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh; 540 } else { 541 struct ip6_hdr *iph = c->next_nh; 542 th = (struct tcphdr *)(iph + 1); 543 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh; 544 } 545 546 hdr_length = (char *) th + th->th_off * 4 - eh; 547 data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) - 548 hdr_length); 549 th_seq = ntohl(th->th_seq); 550 dont_merge = ((data_length <= 0) 551 | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN))); 552 553 /* Check for options other than aligned timestamp. */ 554 if (th->th_off != 5) { 555 const uint32_t *opt_ptr = (const uint32_t *) (th + 1); 556 if (th->th_off == 8 && 557 opt_ptr[0] == ntohl((TCPOPT_NOP << 24) | 558 (TCPOPT_NOP << 16) | 559 (TCPOPT_TIMESTAMP << 8) | 560 TCPOLEN_TIMESTAMP)) { 561 /* timestamp option -- okay */ 562 } else { 563 dont_merge = 1; 564 } 565 } 566 567 if (__predict_false(th_seq != c->next_seq)) { 568 /* Out-of-order, so start counting again. */ 569 if (c->mbuf != NULL) 570 sfxge_lro_deliver(&rxq->lro, c); 571 c->n_in_order_pkts -= lro_loss_packets; 572 c->next_seq = th_seq + data_length; 573 ++rxq->lro.n_misorder; 574 goto deliver_buf_out; 575 } 576 c->next_seq = th_seq + data_length; 577 578 now = ticks; 579 if (now - c->last_pkt_ticks > lro_idle_ticks) { 580 ++rxq->lro.n_drop_idle; 581 if (c->mbuf != NULL) 582 sfxge_lro_deliver(&rxq->lro, c); 583 sfxge_lro_drop(rxq, c); 584 return (0); 585 } 586 c->last_pkt_ticks = ticks; 587 588 if (c->n_in_order_pkts < lro_slow_start_packets) { 589 /* May be in slow-start, so don't merge. */ 590 ++rxq->lro.n_slow_start; 591 ++c->n_in_order_pkts; 592 goto deliver_buf_out; 593 } 594 595 if (__predict_false(dont_merge)) { 596 if (c->mbuf != NULL) 597 sfxge_lro_deliver(&rxq->lro, c); 598 if (th->th_flags & (TH_FIN | TH_RST)) { 599 ++rxq->lro.n_drop_closed; 600 sfxge_lro_drop(rxq, c); 601 return (0); 602 } 603 goto deliver_buf_out; 604 } 605 606 rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size; 607 608 if (__predict_true(c->mbuf != NULL)) { 609 /* Remove headers and any padding */ 610 rx_buf->mbuf->m_data += hdr_length; 611 rx_buf->mbuf->m_len = data_length; 612 613 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th); 614 } else { 615 /* Remove any padding */ 616 rx_buf->mbuf->m_len = pkt_length; 617 618 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th); 619 } 620 621 rx_buf->mbuf = NULL; 622 return (1); 623 624 deliver_buf_out: 625 sfxge_rx_deliver(rxq, rx_buf); 626 return (1); 627} 628 629static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash, 630 uint16_t l2_id, void *nh, struct tcphdr *th) 631{ 632 unsigned bucket = conn_hash & st->conns_mask; 633 struct sfxge_lro_conn *c; 634 635 if (st->conns_n[bucket] >= lro_chain_max) { 636 ++st->n_too_many; 637 return; 638 } 639 640 if (!TAILQ_EMPTY(&st->free_conns)) { 641 c = TAILQ_FIRST(&st->free_conns); 642 TAILQ_REMOVE(&st->free_conns, c, link); 643 } else { 644 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT); 645 if (c == NULL) 646 return; 647 c->mbuf = NULL; 648 c->next_buf.mbuf = NULL; 649 } 650 651 /* Create the connection tracking data */ 652 ++st->conns_n[bucket]; 653 TAILQ_INSERT_HEAD(&st->conns[bucket], c, link); 654 c->l2_id = l2_id; 655 c->conn_hash = conn_hash; 656 c->source = th->th_sport; 657 c->dest = th->th_dport; 658 c->n_in_order_pkts = 0; 659 c->last_pkt_ticks = *(volatile int *)&ticks; 660 c->delivered = 0; 661 ++st->n_new_stream; 662 /* NB. We don't initialise c->next_seq, and it doesn't matter what 663 * value it has. Most likely the next packet received for this 664 * connection will not match -- no harm done. 665 */ 666} 667 668/* Process mbuf and decide whether to dispatch it to the stack now or 669 * later. 670 */ 671static void 672sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) 673{ 674 struct sfxge_softc *sc = rxq->sc; 675 struct mbuf *m = rx_buf->mbuf; 676 struct ether_header *eh; 677 struct sfxge_lro_conn *c; 678 uint16_t l2_id; 679 uint16_t l3_proto; 680 void *nh; 681 struct tcphdr *th; 682 uint32_t conn_hash; 683 unsigned bucket; 684 685 /* Get the hardware hash */ 686 conn_hash = efx_pseudo_hdr_hash_get(rxq->common, 687 EFX_RX_HASHALG_TOEPLITZ, 688 mtod(m, uint8_t *)); 689 690 eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size); 691 if (eh->ether_type == htons(ETHERTYPE_VLAN)) { 692 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh; 693 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) | 694 SFXGE_LRO_L2_ID_VLAN; 695 l3_proto = veh->evl_proto; 696 nh = veh + 1; 697 } else { 698 l2_id = 0; 699 l3_proto = eh->ether_type; 700 nh = eh + 1; 701 } 702 703 /* Check whether this is a suitable packet (unfragmented 704 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and 705 * length, and compute a hash if necessary. If not, return. 706 */ 707 if (l3_proto == htons(ETHERTYPE_IP)) { 708 struct ip *iph = nh; 709 710 KASSERT(iph->ip_p == IPPROTO_TCP, 711 ("IPv4 protocol is not TCP, but packet marker is set")); 712 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) | 713 (iph->ip_off & htons(IP_MF | IP_OFFMASK))) 714 goto deliver_now; 715 th = (struct tcphdr *)(iph + 1); 716 } else if (l3_proto == htons(ETHERTYPE_IPV6)) { 717 struct ip6_hdr *iph = nh; 718 719 KASSERT(iph->ip6_nxt == IPPROTO_TCP, 720 ("IPv6 next header is not TCP, but packet marker is set")); 721 l2_id |= SFXGE_LRO_L2_ID_IPV6; 722 th = (struct tcphdr *)(iph + 1); 723 } else { 724 goto deliver_now; 725 } 726 727 bucket = conn_hash & rxq->lro.conns_mask; 728 729 TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) { 730 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash)) 731 continue; 732 if ((c->source - th->th_sport) | (c->dest - th->th_dport)) 733 continue; 734 if (c->mbuf != NULL) { 735 if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) { 736 struct ip *c_iph, *iph = nh; 737 c_iph = c->nh; 738 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) | 739 (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr)) 740 continue; 741 } else { 742 struct ip6_hdr *c_iph, *iph = nh; 743 c_iph = c->nh; 744 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) | 745 ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst)) 746 continue; 747 } 748 } 749 750 /* Re-insert at head of list to reduce lookup time. */ 751 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link); 752 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link); 753 754 if (c->next_buf.mbuf != NULL) { 755 if (!sfxge_lro_try_merge(rxq, c)) 756 goto deliver_now; 757 } else { 758 LIST_INSERT_HEAD(&rxq->lro.active_conns, c, 759 active_link); 760 } 761 c->next_buf = *rx_buf; 762 c->next_eh = eh; 763 c->next_nh = nh; 764 765 rx_buf->mbuf = NULL; 766 rx_buf->flags = EFX_DISCARD; 767 return; 768 } 769 770 sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th); 771 deliver_now: 772 sfxge_rx_deliver(rxq, rx_buf); 773} 774 775static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) 776{ 777 struct sfxge_lro_state *st = &rxq->lro; 778 struct sfxge_lro_conn *c; 779 unsigned t; 780 781 while (!LIST_EMPTY(&st->active_conns)) { 782 c = LIST_FIRST(&st->active_conns); 783 if (!c->delivered && c->mbuf != NULL) 784 sfxge_lro_deliver(st, c); 785 if (sfxge_lro_try_merge(rxq, c)) { 786 if (c->mbuf != NULL) 787 sfxge_lro_deliver(st, c); 788 LIST_REMOVE(c, active_link); 789 } 790 c->delivered = 0; 791 } 792 793 t = *(volatile int *)&ticks; 794 if (__predict_false(t != st->last_purge_ticks)) 795 sfxge_lro_purge_idle(rxq, t); 796} 797 798#else /* !SFXGE_LRO */ 799 800static void 801sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf) 802{ 803} 804 805static void 806sfxge_lro_end_of_burst(struct sfxge_rxq *rxq) 807{ 808} 809 810#endif /* SFXGE_LRO */ 811 812void 813sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop) 814{ 815 struct sfxge_softc *sc = rxq->sc; 816 int if_capenable = sc->ifnet->if_capenable; 817 int lro_enabled = if_capenable & IFCAP_LRO; 818 unsigned int index; 819 struct sfxge_evq *evq; 820 unsigned int completed; 821 unsigned int level; 822 struct mbuf *m; 823 struct sfxge_rx_sw_desc *prev = NULL; 824 825 index = rxq->index; 826 evq = sc->evq[index]; 827 828 SFXGE_EVQ_LOCK_ASSERT_OWNED(evq); 829 830 completed = rxq->completed; 831 while (completed != rxq->pending) { 832 unsigned int id; 833 struct sfxge_rx_sw_desc *rx_desc; 834 835 id = completed++ & rxq->ptr_mask; 836 rx_desc = &rxq->queue[id]; 837 m = rx_desc->mbuf; 838 839 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED)) 840 goto discard; 841 842 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD)) 843 goto discard; 844 845 /* Read the length from the pseudo header if required */ 846 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) { 847 uint16_t tmp_size; 848 int rc; 849 rc = efx_pseudo_hdr_pkt_length_get(rxq->common, 850 mtod(m, uint8_t *), 851 &tmp_size); 852 KASSERT(rc == 0, ("cannot get packet length: %d", rc)); 853 rx_desc->size = (int)tmp_size + sc->rx_prefix_size; 854 } 855 856 prefetch_read_many(mtod(m, caddr_t)); 857 858 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) { 859 case EFX_PKT_IPV4: 860 if (~if_capenable & IFCAP_RXCSUM) 861 rx_desc->flags &= 862 ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP); 863 break; 864 case EFX_PKT_IPV6: 865 if (~if_capenable & IFCAP_RXCSUM_IPV6) 866 rx_desc->flags &= ~EFX_CKSUM_TCPUDP; 867 break; 868 case 0: 869 /* Check for loopback packets */ 870 { 871 struct ether_header *etherhp; 872 873 /*LINTED*/ 874 etherhp = mtod(m, struct ether_header *); 875 876 if (etherhp->ether_type == 877 htons(SFXGE_ETHERTYPE_LOOPBACK)) { 878 EFSYS_PROBE(loopback); 879 880 rxq->loopback++; 881 goto discard; 882 } 883 } 884 break; 885 default: 886 KASSERT(B_FALSE, 887 ("Rx descriptor with both IPv4 and IPv6 flags")); 888 goto discard; 889 } 890 891 /* Pass packet up the stack or into LRO (pipelined) */ 892 if (prev != NULL) { 893 if (lro_enabled && 894 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) == 895 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP))) 896 sfxge_lro(rxq, prev); 897 else 898 sfxge_rx_deliver(rxq, prev); 899 } 900 prev = rx_desc; 901 continue; 902 903discard: 904 /* Return the packet to the pool */ 905 m_free(m); 906 rx_desc->mbuf = NULL; 907 } 908 rxq->completed = completed; 909 910 level = rxq->added - rxq->completed; 911 912 /* Pass last packet up the stack or into LRO */ 913 if (prev != NULL) { 914 if (lro_enabled && 915 ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) == 916 (EFX_PKT_TCP | EFX_CKSUM_TCPUDP))) 917 sfxge_lro(rxq, prev); 918 else 919 sfxge_rx_deliver(rxq, prev); 920 } 921 922 /* 923 * If there are any pending flows and this is the end of the 924 * poll then they must be completed. 925 */ 926 if (eop) 927 sfxge_lro_end_of_burst(rxq); 928 929 /* Top up the queue if necessary */ 930 if (level < rxq->refill_threshold) 931 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE); 932} 933 934static void 935sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index) 936{ 937 struct sfxge_rxq *rxq; 938 struct sfxge_evq *evq; 939 unsigned int count; 940 unsigned int retry = 3; 941 942 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc); 943 944 rxq = sc->rxq[index]; 945 evq = sc->evq[index]; 946 947 SFXGE_EVQ_LOCK(evq); 948 949 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED, 950 ("rxq not started")); 951 952 rxq->init_state = SFXGE_RXQ_INITIALIZED; 953 954 callout_stop(&rxq->refill_callout); 955 956 while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) { 957 rxq->flush_state = SFXGE_FLUSH_PENDING; 958 959 SFXGE_EVQ_UNLOCK(evq); 960 961 /* Flush the receive queue */ 962 if (efx_rx_qflush(rxq->common) != 0) { 963 SFXGE_EVQ_LOCK(evq); 964 rxq->flush_state = SFXGE_FLUSH_FAILED; 965 break; 966 } 967 968 count = 0; 969 do { 970 /* Spin for 100 ms */ 971 DELAY(100000); 972 973 if (rxq->flush_state != SFXGE_FLUSH_PENDING) 974 break; 975 976 } while (++count < 20); 977 978 SFXGE_EVQ_LOCK(evq); 979 980 if (rxq->flush_state == SFXGE_FLUSH_PENDING) { 981 /* Flush timeout - neither done nor failed */ 982 log(LOG_ERR, "%s: Cannot flush Rx queue %u\n", 983 device_get_nameunit(sc->dev), index); 984 rxq->flush_state = SFXGE_FLUSH_DONE; 985 } 986 retry--; 987 } 988 if (rxq->flush_state == SFXGE_FLUSH_FAILED) { 989 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n", 990 device_get_nameunit(sc->dev), index); 991 rxq->flush_state = SFXGE_FLUSH_DONE; 992 } 993 994 rxq->pending = rxq->added; 995 sfxge_rx_qcomplete(rxq, B_TRUE); 996 997 KASSERT(rxq->completed == rxq->pending, 998 ("rxq->completed != rxq->pending")); 999 1000 rxq->added = 0; 1001 rxq->pushed = 0; 1002 rxq->pending = 0; 1003 rxq->completed = 0; 1004 rxq->loopback = 0; 1005 1006 /* Destroy the common code receive queue. */ 1007 efx_rx_qdestroy(rxq->common); 1008 1009 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, 1010 EFX_RXQ_NBUFS(sc->rxq_entries)); 1011 1012 SFXGE_EVQ_UNLOCK(evq); 1013} 1014 1015static int 1016sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index) 1017{ 1018 struct sfxge_rxq *rxq; 1019 efsys_mem_t *esmp; 1020 struct sfxge_evq *evq; 1021 int rc; 1022 1023 SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc); 1024 1025 rxq = sc->rxq[index]; 1026 esmp = &rxq->mem; 1027 evq = sc->evq[index]; 1028 1029 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, 1030 ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); 1031 KASSERT(evq->init_state == SFXGE_EVQ_STARTED, 1032 ("evq->init_state != SFXGE_EVQ_STARTED")); 1033 1034 /* Program the buffer table. */ 1035 if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp, 1036 EFX_RXQ_NBUFS(sc->rxq_entries))) != 0) 1037 return (rc); 1038 1039 /* Create the common code receive queue. */ 1040 if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT, 1041 esmp, sc->rxq_entries, rxq->buf_base_id, evq->common, 1042 &rxq->common)) != 0) 1043 goto fail; 1044 1045 SFXGE_EVQ_LOCK(evq); 1046 1047 /* Enable the receive queue. */ 1048 efx_rx_qenable(rxq->common); 1049 1050 rxq->init_state = SFXGE_RXQ_STARTED; 1051 rxq->flush_state = SFXGE_FLUSH_REQUIRED; 1052 1053 /* Try to fill the queue from the pool. */ 1054 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE); 1055 1056 SFXGE_EVQ_UNLOCK(evq); 1057 1058 return (0); 1059 1060fail: 1061 efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id, 1062 EFX_RXQ_NBUFS(sc->rxq_entries)); 1063 return (rc); 1064} 1065 1066void 1067sfxge_rx_stop(struct sfxge_softc *sc) 1068{ 1069 int index; 1070 1071 efx_mac_filter_default_rxq_clear(sc->enp); 1072 1073 /* Stop the receive queue(s) */ 1074 index = sc->rxq_count; 1075 while (--index >= 0) 1076 sfxge_rx_qstop(sc, index); 1077 1078 sc->rx_prefix_size = 0; 1079 sc->rx_buffer_size = 0; 1080 1081 efx_rx_fini(sc->enp); 1082} 1083 1084int 1085sfxge_rx_start(struct sfxge_softc *sc) 1086{ 1087 struct sfxge_intr *intr; 1088 const efx_nic_cfg_t *encp; 1089 size_t hdrlen, align, reserved; 1090 int index; 1091 int rc; 1092 1093 intr = &sc->intr; 1094 1095 /* Initialize the common code receive module. */ 1096 if ((rc = efx_rx_init(sc->enp)) != 0) 1097 return (rc); 1098 1099 encp = efx_nic_cfg_get(sc->enp); 1100 sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu); 1101 1102 /* Calculate the receive packet buffer size. */ 1103 sc->rx_prefix_size = encp->enc_rx_prefix_size; 1104 1105 /* Ensure IP headers are 32bit aligned */ 1106 hdrlen = sc->rx_prefix_size + sizeof (struct ether_header); 1107 sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen; 1108 1109 sc->rx_buffer_size += sc->rx_buffer_align; 1110 1111 /* Align end of packet buffer for RX DMA end padding */ 1112 align = MAX(1, encp->enc_rx_buf_align_end); 1113 EFSYS_ASSERT(ISP2(align)); 1114 sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align); 1115 1116 /* 1117 * Standard mbuf zones only guarantee pointer-size alignment; 1118 * we need extra space to align to the cache line 1119 */ 1120 reserved = sc->rx_buffer_size + CACHE_LINE_SIZE; 1121 1122 /* Select zone for packet buffers */ 1123 if (reserved <= MCLBYTES) 1124 sc->rx_cluster_size = MCLBYTES; 1125 else if (reserved <= MJUMPAGESIZE) 1126 sc->rx_cluster_size = MJUMPAGESIZE; 1127 else if (reserved <= MJUM9BYTES) 1128 sc->rx_cluster_size = MJUM9BYTES; 1129 else 1130 sc->rx_cluster_size = MJUM16BYTES; 1131 1132 /* 1133 * Set up the scale table. Enable all hash types and hash insertion. 1134 */ 1135 for (index = 0; index < nitems(sc->rx_indir_table); index++) 1136#ifdef RSS 1137 sc->rx_indir_table[index] = 1138 rss_get_indirection_to_bucket(index) % sc->rxq_count; 1139#else 1140 sc->rx_indir_table[index] = index % sc->rxq_count; 1141#endif 1142 if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table, 1143 nitems(sc->rx_indir_table))) != 0) 1144 goto fail; 1145 (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ, 1146 EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 | 1147 EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE); 1148 1149#ifdef RSS 1150 rss_getkey(toep_key); 1151#endif 1152 if ((rc = efx_rx_scale_key_set(sc->enp, toep_key, 1153 sizeof(toep_key))) != 0) 1154 goto fail; 1155 1156 /* Start the receive queue(s). */ 1157 for (index = 0; index < sc->rxq_count; index++) { 1158 if ((rc = sfxge_rx_qstart(sc, index)) != 0) 1159 goto fail2; 1160 } 1161 1162 rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common, 1163 sc->intr.n_alloc > 1); 1164 if (rc != 0) 1165 goto fail3; 1166 1167 return (0); 1168 1169fail3: 1170fail2: 1171 while (--index >= 0) 1172 sfxge_rx_qstop(sc, index); 1173 1174fail: 1175 efx_rx_fini(sc->enp); 1176 1177 return (rc); 1178} 1179 1180#ifdef SFXGE_LRO 1181 1182static void sfxge_lro_init(struct sfxge_rxq *rxq) 1183{ 1184 struct sfxge_lro_state *st = &rxq->lro; 1185 unsigned i; 1186 1187 st->conns_mask = lro_table_size - 1; 1188 KASSERT(!((st->conns_mask + 1) & st->conns_mask), 1189 ("lro_table_size must be a power of 2")); 1190 st->sc = rxq->sc; 1191 st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]), 1192 M_SFXGE, M_WAITOK); 1193 st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]), 1194 M_SFXGE, M_WAITOK); 1195 for (i = 0; i <= st->conns_mask; ++i) { 1196 TAILQ_INIT(&st->conns[i]); 1197 st->conns_n[i] = 0; 1198 } 1199 LIST_INIT(&st->active_conns); 1200 TAILQ_INIT(&st->free_conns); 1201} 1202 1203static void sfxge_lro_fini(struct sfxge_rxq *rxq) 1204{ 1205 struct sfxge_lro_state *st = &rxq->lro; 1206 struct sfxge_lro_conn *c; 1207 unsigned i; 1208 1209 /* Return cleanly if sfxge_lro_init() has not been called. */ 1210 if (st->conns == NULL) 1211 return; 1212 1213 KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections")); 1214 1215 for (i = 0; i <= st->conns_mask; ++i) { 1216 while (!TAILQ_EMPTY(&st->conns[i])) { 1217 c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq); 1218 sfxge_lro_drop(rxq, c); 1219 } 1220 } 1221 1222 while (!TAILQ_EMPTY(&st->free_conns)) { 1223 c = TAILQ_FIRST(&st->free_conns); 1224 TAILQ_REMOVE(&st->free_conns, c, link); 1225 KASSERT(!c->mbuf, ("found orphaned mbuf")); 1226 free(c, M_SFXGE); 1227 } 1228 1229 free(st->conns_n, M_SFXGE); 1230 free(st->conns, M_SFXGE); 1231 st->conns = NULL; 1232} 1233 1234#else 1235 1236static void 1237sfxge_lro_init(struct sfxge_rxq *rxq) 1238{ 1239} 1240 1241static void 1242sfxge_lro_fini(struct sfxge_rxq *rxq) 1243{ 1244} 1245 1246#endif /* SFXGE_LRO */ 1247 1248static void 1249sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index) 1250{ 1251 struct sfxge_rxq *rxq; 1252 1253 rxq = sc->rxq[index]; 1254 1255 KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED, 1256 ("rxq->init_state != SFXGE_RXQ_INITIALIZED")); 1257 1258 /* Free the context array and the flow table. */ 1259 free(rxq->queue, M_SFXGE); 1260 sfxge_lro_fini(rxq); 1261 1262 /* Release DMA memory. */ 1263 sfxge_dma_free(&rxq->mem); 1264 1265 sc->rxq[index] = NULL; 1266 1267 free(rxq, M_SFXGE); 1268} 1269 1270static int 1271sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index) 1272{ 1273 struct sfxge_rxq *rxq; 1274 struct sfxge_evq *evq; 1275 efsys_mem_t *esmp; 1276 int rc; 1277 1278 KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count)); 1279 1280 rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK); 1281 rxq->sc = sc; 1282 rxq->index = index; 1283 rxq->entries = sc->rxq_entries; 1284 rxq->ptr_mask = rxq->entries - 1; 1285 rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries); 1286 1287 sc->rxq[index] = rxq; 1288 esmp = &rxq->mem; 1289 1290 evq = sc->evq[index]; 1291 1292 /* Allocate and zero DMA space. */ 1293 if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0) 1294 return (rc); 1295 1296 /* Allocate buffer table entries. */ 1297 sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries), 1298 &rxq->buf_base_id); 1299 1300 /* Allocate the context array and the flow table. */ 1301 rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries, 1302 M_SFXGE, M_WAITOK | M_ZERO); 1303 sfxge_lro_init(rxq); 1304 1305 callout_init(&rxq->refill_callout, 1); 1306 1307 rxq->init_state = SFXGE_RXQ_INITIALIZED; 1308 1309 return (0); 1310} 1311 1312static const struct { 1313 const char *name; 1314 size_t offset; 1315} sfxge_rx_stats[] = { 1316#define SFXGE_RX_STAT(name, member) \ 1317 { #name, offsetof(struct sfxge_rxq, member) } 1318#ifdef SFXGE_LRO 1319 SFXGE_RX_STAT(lro_merges, lro.n_merges), 1320 SFXGE_RX_STAT(lro_bursts, lro.n_bursts), 1321 SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start), 1322 SFXGE_RX_STAT(lro_misorder, lro.n_misorder), 1323 SFXGE_RX_STAT(lro_too_many, lro.n_too_many), 1324 SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream), 1325 SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle), 1326 SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed) 1327#endif 1328}; 1329 1330static int 1331sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS) 1332{ 1333 struct sfxge_softc *sc = arg1; 1334 unsigned int id = arg2; 1335 unsigned int sum, index; 1336 1337 /* Sum across all RX queues */ 1338 sum = 0; 1339 for (index = 0; index < sc->rxq_count; index++) 1340 sum += *(unsigned int *)((caddr_t)sc->rxq[index] + 1341 sfxge_rx_stats[id].offset); 1342 1343 return (SYSCTL_OUT(req, &sum, sizeof(sum))); 1344} 1345 1346static void 1347sfxge_rx_stat_init(struct sfxge_softc *sc) 1348{ 1349 struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev); 1350 struct sysctl_oid_list *stat_list; 1351 unsigned int id; 1352 1353 stat_list = SYSCTL_CHILDREN(sc->stats_node); 1354 1355 for (id = 0; id < nitems(sfxge_rx_stats); id++) { 1356 SYSCTL_ADD_PROC( 1357 ctx, stat_list, 1358 OID_AUTO, sfxge_rx_stats[id].name, 1359 CTLTYPE_UINT|CTLFLAG_RD, 1360 sc, id, sfxge_rx_stat_handler, "IU", 1361 ""); 1362 } 1363} 1364 1365void 1366sfxge_rx_fini(struct sfxge_softc *sc) 1367{ 1368 int index; 1369 1370 index = sc->rxq_count; 1371 while (--index >= 0) 1372 sfxge_rx_qfini(sc, index); 1373 1374 sc->rxq_count = 0; 1375} 1376 1377int 1378sfxge_rx_init(struct sfxge_softc *sc) 1379{ 1380 struct sfxge_intr *intr; 1381 int index; 1382 int rc; 1383 1384#ifdef SFXGE_LRO 1385 if (!ISP2(lro_table_size)) { 1386 log(LOG_ERR, "%s=%u must be power of 2", 1387 SFXGE_LRO_PARAM(table_size), lro_table_size); 1388 rc = EINVAL; 1389 goto fail_lro_table_size; 1390 } 1391 1392 if (lro_idle_ticks == 0) 1393 lro_idle_ticks = hz / 10 + 1; /* 100 ms */ 1394#endif 1395 1396 intr = &sc->intr; 1397 1398 sc->rxq_count = intr->n_alloc; 1399 1400 KASSERT(intr->state == SFXGE_INTR_INITIALIZED, 1401 ("intr->state != SFXGE_INTR_INITIALIZED")); 1402 1403 /* Initialize the receive queue(s) - one per interrupt. */ 1404 for (index = 0; index < sc->rxq_count; index++) { 1405 if ((rc = sfxge_rx_qinit(sc, index)) != 0) 1406 goto fail; 1407 } 1408 1409 sfxge_rx_stat_init(sc); 1410 1411 return (0); 1412 1413fail: 1414 /* Tear down the receive queue(s). */ 1415 while (--index >= 0) 1416 sfxge_rx_qfini(sc, index); 1417 1418 sc->rxq_count = 0; 1419 1420#ifdef SFXGE_LRO 1421fail_lro_table_size: 1422#endif 1423 return (rc); 1424} 1425