sfxge_rx.c revision 301065
/*-
 * Copyright (c) 2010-2016 Solarflare Communications Inc.
 * All rights reserved.
 *
 * This software was developed in part by Philip Paeps under contract for
 * Solarflare Communications, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are
 * those of the authors and should not be interpreted as representing official
 * policies, either expressed or implied, of the FreeBSD Project.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/dev/sfxge/sfxge_rx.c 301065 2016-05-31 18:31:17Z arybchik $");

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/limits.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>

#include <machine/in_cksum.h>

#include "common/efx.h"

#include "sfxge.h"
#include "sfxge_rx.h"

#define	RX_REFILL_THRESHOLD(_entries)	(EFX_RXQ_LIMIT(_entries) * 9 / 10)
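
/*
 * Tunables controlling the software large receive offload (LRO)
 * implementation below.  They are exposed as loader tunables and read-only
 * sysctls under the hw.sfxge.lro node created here.
 */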
#ifdef SFXGE_LRO

SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
	    "Large receive offload (LRO) parameters");

#define	SFXGE_LRO_PARAM(_param)	SFXGE_PARAM(lro._param)

/* Size of the LRO hash table.  Must be a power of 2.  A larger table
 * means we can accelerate a larger number of streams.
 */
static unsigned lro_table_size = 128;
TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
	    &lro_table_size, 0,
	    "Size of the LRO hash table (must be a power of 2)");

/* Maximum length of a hash chain.  If chains get too long then the lookup
 * time increases and may exceed the benefit of LRO.
 */
static unsigned lro_chain_max = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
	    &lro_chain_max, 0,
	    "The maximum length of a hash chain");

/* Maximum time (in ticks) that a connection can be idle before its LRO
 * state is discarded.
 */
static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
	    &lro_idle_ticks, 0,
	    "The maximum time (in ticks) that a connection can be idle "
	    "before its LRO state is discarded");

/* Number of packets with payload that must arrive in-order before a
 * connection is eligible for LRO.  The idea is we should avoid coalescing
 * segments when the sender is in slow-start because reducing the ACK rate
 * can damage performance.
 */
static int lro_slow_start_packets = 2000;
TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
	    &lro_slow_start_packets, 0,
	    "Number of packets with payload that must arrive in-order before "
	    "a connection is eligible for LRO");

/* Number of packets with payload that must arrive in-order following loss
 * before a connection is eligible for LRO.  The idea is we should avoid
 * coalescing segments when the sender is recovering from loss, because
 * reducing the ACK rate can damage performance.
 */
static int lro_loss_packets = 20;
TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
	    &lro_loss_packets, 0,
	    "Number of packets with payload that must arrive in-order "
	    "following loss before a connection is eligible for LRO");

/* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
#define	SFXGE_LRO_L2_ID_VLAN 0x4000
#define	SFXGE_LRO_L2_ID_IPV6 0x8000
#define	SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
#define	SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))

/* Compare IPv6 addresses, avoiding conditional branches */
static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
				   const struct in6_addr *right)
{
#if LONG_BIT == 64
	const uint64_t *left64 = (const uint64_t *)left;
	const uint64_t *right64 = (const uint64_t *)right;
	return (left64[0] - right64[0]) | (left64[1] - right64[1]);
#else
	return (left->s6_addr32[0] - right->s6_addr32[0]) |
	       (left->s6_addr32[1] - right->s6_addr32[1]) |
	       (left->s6_addr32[2] - right->s6_addr32[2]) |
	       (left->s6_addr32[3] - right->s6_addr32[3]);
#endif
}

#endif	/* SFXGE_LRO */

void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}

void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}
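
/*
 * RSS hash key.  This is the 40-byte default Toeplitz key from the
 * Microsoft RSS specification.
 */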
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static void
sfxge_rx_post_refill(void *arg)
{
	struct sfxge_rxq *rxq = arg;
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int label;
	uint16_t magic;

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	label = 0;
	KASSERT((label & SFXGE_MAGIC_DMAQ_LABEL_MASK) == label,
	    ("(label & SFXGE_MAGIC_DMAQ_LABEL_MASK) != label"));
	magic = SFXGE_MAGIC_RX_QREFILL | label;

	/* This is guaranteed due to the start/stop order of rx and ev */
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq not started"));
	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));
	efx_ev_qpost(evq->common, magic);
}

static void
sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
{
	/* Initially retry after 100 ms, but back off in case of
	 * repeated failures as we probably have to wait for the
	 * administrator to raise the pool limit. */
	if (retrying)
		rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
	else
		rxq->refill_delay = hz / 10;

	callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
			     sfxge_rx_post_refill, rxq);
}

#define	SFXGE_REFILL_BATCH  64
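
/*
 * Receive descriptors are posted to the hardware ring in batches: up to
 * SFXGE_REFILL_BATCH DMA addresses are accumulated on the stack and handed
 * to efx_rx_qpost() in a single call; efx_rx_qpush() then tells the
 * controller how many descriptors have been added.
 */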
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
	    ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
	ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
	    ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		id = (rxq->added + batch) & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
		    sc->rx_cluster_size);
		if (m == NULL)
			break;

		/* m_len specifies length of area to be mapped for DMA */
		m->m_len = mblksize;
		m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
		m->m_data += sc->rx_buffer_align;

		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
			BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);

	/* The queue could still be empty if no descriptors were actually
	 * pushed, in which case there will be no event to cause the next
	 * refill, so we must schedule a refill ourselves.
	 */
	if (rxq->pushed == rxq->completed) {
		sfxge_rx_schedule_refill(rxq, retrying);
	}
}

void
sfxge_rx_qrefill(struct sfxge_rxq *rxq)
{

	if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
		return;

	/* Make sure the queue is full */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
}

static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}

static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int flags = rx_desc->flags;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (flags & EFX_CKSUM_IPV4) ?
		(CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

	if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		m->m_pkthdr.flowid =
			efx_psuedo_hdr_hash_get(sc->enp,
						EFX_RX_HASHALG_TOEPLITZ,
						mtod(m, uint8_t *));
		/* The hash covers a 4-tuple for TCP only */
		M_HASHTYPE_SET(m,
		    (flags & EFX_PKT_IPV4) ?
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
			((flags & EFX_PKT_TCP) ?
			    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
	}
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}
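
/*
 * Software large receive offload (LRO).  Received TCP segments that belong
 * to the same connection are coalesced into a single large packet before
 * being handed to the stack, reducing per-packet processing costs.  The
 * functions below maintain per-queue connection state in a hash table keyed
 * by the hardware RSS hash.
 */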
#ifdef SFXGE_LRO

static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
			      CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

	m->m_pkthdr.flowid = c->conn_hash;
	M_HASHTYPE_SET(m,
	    SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
		M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);

	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}

/* Drop the given connection, and add it to the free list. */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf != NULL) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}

/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}
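
/*
 * Append a new segment to an open coalesced packet.  The IP total length
 * field is kept in host byte order while the chain is being built (see
 * sfxge_lro_start()) and is only converted back and checksummed when the
 * packet is delivered.  The chain is flushed early if another large segment
 * could overflow the 16-bit IP length field.
 */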
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}

static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}

/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf).  Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
		       hdr_length);
	th_seq = ntohl(th->th_seq);
	dont_merge = ((data_length <= 0)
		      | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return (0);
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf != NULL)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return (0);
		}
		goto deliver_buf_out;
	}

	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	rx_buf->mbuf = NULL;
	return (1);

 deliver_buf_out:
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return (1);
}
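
/*
 * Start tracking a new connection.  State is recycled from the free list
 * when possible, and the number of connections per hash bucket is capped
 * at lro_chain_max to bound lookup time.
 */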
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has.  Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}

/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = efx_psuedo_hdr_hash_get(sc->enp,
					    EFX_RX_HASHALG_TOEPLITZ,
					    mtod(m, uint8_t *));

	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
	 * length, and compute a hash if necessary.  If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;

		KASSERT(iph->ip_p == IPPROTO_TCP,
		    ("IPv4 protocol is not TCP, but packet marker is set"));
		if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;

		KASSERT(iph->ip6_nxt == IPPROTO_TCP,
		    ("IPv6 next header is not TCP, but packet marker is set"));
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf != NULL) {
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf != NULL) {
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}
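
/*
 * Called at the end of an event-queue poll: finish merging the buffers held
 * by the connections that received traffic during the poll, deliver anything
 * that is ready, and periodically purge idle connection state.
 */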
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		if (!c->delivered && c->mbuf != NULL)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf != NULL)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		c->delivered = 0;
	}

	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}

#else	/* !SFXGE_LRO */

static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
}

static void
sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */
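
/*
 * Process receive completions.  Called with the event queue lock held
 * (normally from event processing).  Delivery of each packet is pipelined
 * one descriptor behind so that the payload prefetch has time to complete
 * before the packet is passed to LRO or to the stack.
 */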
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int if_capenable = sc->ifnet->if_capenable;
	int lro_enabled = if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		id = completed++ & rxq->ptr_mask;
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		/* Read the length from the pseudo header if required */
		if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
			uint16_t tmp_size;
			int rc;
			rc = efx_psuedo_hdr_pkt_length_get(sc->enp,
							   mtod(m, uint8_t *),
							   &tmp_size);
			KASSERT(rc == 0, ("cannot get packet length: %d", rc));
			rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
		}

		prefetch_read_many(mtod(m, caddr_t));

		switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
		case EFX_PKT_IPV4:
			if (~if_capenable & IFCAP_RXCSUM)
				rx_desc->flags &=
				    ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
			break;
		case EFX_PKT_IPV6:
			if (~if_capenable & IFCAP_RXCSUM_IPV6)
				rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
			break;
		case 0:
			/* Check for loopback packets */
			{
				struct ether_header *etherhp;

				/*LINTED*/
				etherhp = mtod(m, struct ether_header *);

				if (etherhp->ether_type ==
				    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
					EFSYS_PROBE(loopback);

					rxq->loopback++;
					goto discard;
				}
			}
			break;
		default:
			KASSERT(B_FALSE,
			    ("Rx descriptor with both IPv4 and IPv6 flags"));
			goto discard;
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled &&
			    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
			     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled &&
		    ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
		     (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < rxq->refill_threshold)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
}
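
/*
 * Stop a receive queue.  The queue is flushed through the common code; the
 * flush is polled for up to two seconds and retried up to three times, and
 * a timeout is treated as completion so that teardown can continue.
 */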
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;
	unsigned int retry = 3;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	SFXGE_EVQ_LOCK(evq);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

	while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
		rxq->flush_state = SFXGE_FLUSH_PENDING;

		SFXGE_EVQ_UNLOCK(evq);

		/* Flush the receive queue */
		if (efx_rx_qflush(rxq->common) != 0) {
			SFXGE_EVQ_LOCK(evq);
			rxq->flush_state = SFXGE_FLUSH_FAILED;
			break;
		}

		count = 0;
		do {
			/* Spin for 100 ms */
			DELAY(100000);

			if (rxq->flush_state != SFXGE_FLUSH_PENDING)
				break;

		} while (++count < 20);

		SFXGE_EVQ_LOCK(evq);

		if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
			/* Flush timeout - neither done nor failed */
			log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
			    device_get_nameunit(sc->dev), index);
			rxq->flush_state = SFXGE_FLUSH_DONE;
		}
		retry--;
	}
	if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
		log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
		    device_get_nameunit(sc->dev), index);
		rxq->flush_state = SFXGE_FLUSH_DONE;
	}

	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pushed = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));

	SFXGE_EVQ_UNLOCK(evq);
}

static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
		return (rc);

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
	    esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	SFXGE_EVQ_LOCK(evq);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;
	rxq->flush_state = SFXGE_FLUSH_REQUIRED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);

	SFXGE_EVQ_UNLOCK(evq);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(sc->rxq_entries));
	return (rc);
}
void
sfxge_rx_stop(struct sfxge_softc *sc)
{
	int index;

	efx_mac_filter_default_rxq_clear(sc->enp);

	/* Stop the receive queue(s) */
	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

	sc->rx_prefix_size = 0;
	sc->rx_buffer_size = 0;

	efx_rx_fini(sc->enp);
}
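
/*
 * Start the receive path.  The receive buffer layout is computed here: the
 * hardware prefix and an alignment pad precede the packet data, the buffer
 * is rounded up to the controller's end-padding alignment, and an extra
 * CACHE_LINE_SIZE of slack is reserved so that the start of the buffer can
 * be cache-line aligned within a standard mbuf cluster zone.
 */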
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	const efx_nic_cfg_t *encp;
	size_t hdrlen, align, reserved;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	encp = efx_nic_cfg_get(sc->enp);
	sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = encp->enc_rx_prefix_size;

	/* Ensure IP headers are 32bit aligned */
	hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
	sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;

	sc->rx_buffer_size += sc->rx_buffer_align;

	/* Align end of packet buffer for RX DMA end padding */
	align = MAX(1, encp->enc_rx_buf_align_end);
	EFSYS_ASSERT(ISP2(align));
	sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);

	/*
	 * Standard mbuf zones only guarantee pointer-size alignment;
	 * we need extra space to align to the cache line
	 */
	reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;

	/* Select zone for packet buffers */
	if (reserved <= MCLBYTES)
		sc->rx_cluster_size = MCLBYTES;
	else if (reserved <= MJUMPAGESIZE)
		sc->rx_cluster_size = MJUMPAGESIZE;
	else if (reserved <= MJUM9BYTES)
		sc->rx_cluster_size = MJUM9BYTES;
	else
		sc->rx_cluster_size = MJUM16BYTES;

	/*
	 * Set up the scale table.  Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->rxq_count;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
				       sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
					    sc->intr.n_alloc > 1);
	if (rc != 0)
		goto fail3;

	return (0);

fail3:
fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}

#ifdef SFXGE_LRO

static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}

static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}

#else

static void
sfxge_lro_init(struct sfxge_rxq *rxq)
{
}

static void
sfxge_lro_fini(struct sfxge_rxq *rxq)
{
}

#endif	/* SFXGE_LRO */

static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
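
/*
 * Allocate the resources for one receive queue: DMA memory and buffer table
 * entries for the hardware descriptor ring, plus a parallel array of
 * software descriptors (struct sfxge_rx_sw_desc) indexed by the same
 * pointer mask as the hardware ring.
 */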
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;
	rxq->entries = sc->rxq_entries;
	rxq->ptr_mask = rxq->entries - 1;
	rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
		return (rc);

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
	    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	callout_init(&rxq->refill_callout, 1);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}

static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define	SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
#ifdef SFXGE_LRO
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
#endif
};

static int
sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
{
	struct sfxge_softc *sc = arg1;
	unsigned int id = arg2;
	unsigned int sum, index;

	/* Sum across all RX queues */
	sum = 0;
	for (index = 0; index < sc->rxq_count; index++)
		sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
					 sfxge_rx_stats[id].offset);

	return (SYSCTL_OUT(req, &sum, sizeof(sum)));
}

static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0; id < nitems(sfxge_rx_stats); id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}

void
sfxge_rx_fini(struct sfxge_softc *sc)
{
	int index;

	index = sc->rxq_count;
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;
}

int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

#ifdef SFXGE_LRO
	if (!ISP2(lro_table_size)) {
		log(LOG_ERR, "%s=%u must be a power of 2",
		    SFXGE_LRO_PARAM(table_size), lro_table_size);
		rc = EINVAL;
		goto fail_lro_table_size;
	}

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1;	/* 100 ms */
#endif

	intr = &sc->intr;

	sc->rxq_count = intr->n_alloc;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < sc->rxq_count; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	sc->rxq_count = 0;

#ifdef SFXGE_LRO
fail_lro_table_size:
#endif
	return (rc);
}