/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/dev/hyperv/netvsc/if_hn.c 356412 2020-01-06 09:51:54Z hselasky $");

#include "opt_inet6.h"
#include "opt_inet.h"
#include "opt_hn.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/sbuf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

/* NOTE: M_HASHTYPE_RSS_UDP_IPV4 is not available on stable/10. */
#ifndef M_HASHTYPE_RSS_UDP_IPV4
#define M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_OPAQUE
#endif

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_fixup_rx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");

/*
 * Offload UDP/IPv4 checksum.
 */
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

/* Stats. */
static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function fails, the m_head will have been freed.
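 * The caller must not touch m_head again once NULL is returned.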
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP &&
		    ip6->ip6_nxt != IPPROTO_UDP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
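	 * Promiscuous mode keeps the synthetic device from dropping
	 * packets that the VF's own RX filter would have accepted.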
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}

static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
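	 * (NDIS_RSS_FLAG_NONE pushes the complete RSS parameter set,
	 * i.e. key and indirect table, down to the host again.)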
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}

static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	/*
	 * During detach events ifp->if_addr might be NULL.
	 * Make sure the bcmp() below doesn't panic on that:
	 */
	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}

static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}

static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
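	 * if_hwassist must be kept in sync with if_capenable, hence
	 * the per-capability fixups below.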
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}

static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
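		 * NOTE: m_freem() frees only a single packet, so the
		 * m_nextpkt linkage has to be walked manually here.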
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}

static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}

static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
		types |= RSS_TYPE_UDP_IPV4;
	return (types);
}

static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	if (types & RSS_TYPE_UDP_IPV4)
		rss_hash |= NDIS_HASH_UDP_IPV4_X;
	return (rss_hash);
}

static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}

static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only a 40-byte Toeplitz key is
	 * supported.
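	 * (NDIS_HASH_KEYSIZE_TOEPLITZ bytes; keys of any other length
	 * are rejected below.)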
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */

	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;

	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}

static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}

static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
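	 * This is routed through the VF's SIOCSIFCAP handler via
	 * hn_xpnt_vf_iocsetcaps().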
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}

static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}

static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fix up RSS related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS key during initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}

static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
1979 *
1980 * NOTE:
1981 * - This lock _must_ be released, since the hn_vf_init task
1982 * will try holding this lock.
1983 * - It is safe to release this lock here, since the
1984 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1985 *
1986 * XXX racy, if hn(4) ever detached.
1987 */
1988 HN_UNLOCK(sc);
1989 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1990 HN_LOCK(sc);
1991
1992 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1993 sc->hn_ifp->if_xname));
1994 ifp->if_input = sc->hn_vf_input;
1995 sc->hn_vf_input = NULL;
1996
1997 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1998 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1999 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2000
2001 if (sc->hn_vf_rdytick == 0) {
2002 /*
2003 * The VF was ready; restore some settings.
2004 */
2005 sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2006 /*
2007 * NOTE:
2008 * There is _no_ need to fixup if_capenable and
2009 * if_hwassist, since the if_capabilities before
2010 * restoration was an intersection of the VF's
2011 * if_capabilities and the synthetic device's
2012 * if_capabilities.
2013 */
2014 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2015 sc->hn_ifp->if_hw_tsomaxsegcount =
2016 sc->hn_saved_tsosegcnt;
2017 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2018 }
2019
2020 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2021 /*
2022 * Restore RSS settings.
2023 */
2024 hn_vf_rss_restore(sc);
2025
2026 /*
2027 * Resume link status management, which was suspended
2028 * by hn_ifnet_attevent().
2029 */
2030 hn_resume_mgmt(sc);
2031 }
2032 }
2033
2034 /* Mark transparent mode VF as disabled. */
2035 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2036
2037 rm_wlock(&hn_vfmap_lock);
2038
2039 KASSERT(ifp->if_index < hn_vfmap_size,
2040 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2041 if (hn_vfmap[ifp->if_index] != NULL) {
2042 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2043 ("%s: ifindex %d was mapped to %s",
2044 ifp->if_xname, ifp->if_index,
2045 hn_vfmap[ifp->if_index]->if_xname));
2046 hn_vfmap[ifp->if_index] = NULL;
2047 }
2048
2049 rm_wunlock(&hn_vfmap_lock);
2050done:
2051 HN_UNLOCK(sc);
2052}
2053
2054static void
2055hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2056{
2057 struct hn_softc *sc = xsc;
2058
2059 if (sc->hn_vf_ifp == ifp)
2060 if_link_state_change(sc->hn_ifp, link_state);
2061}
2062
2063static int
2064hn_probe(device_t dev)
2065{
2066
2067 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2068 device_set_desc(dev, "Hyper-V Network Interface");
2069 return BUS_PROBE_DEFAULT;
2070 }
2071 return ENXIO;
2072}
2073
2074static int
2075hn_attach(device_t dev)
2076{
2077 struct hn_softc *sc = device_get_softc(dev);
2078 struct sysctl_oid_list *child;
2079 struct sysctl_ctx_list *ctx;
2080 uint8_t eaddr[ETHER_ADDR_LEN];
2081 struct ifnet *ifp = NULL;
2082 int error, ring_cnt, tx_ring_cnt;
2083 uint32_t mtu;
2084
2085 sc->hn_dev = dev;
2086 sc->hn_prichan = vmbus_get_channel(dev);
2087 HN_LOCK_INIT(sc);
2088 rm_init(&sc->hn_vf_lock, "hnvf");
2089 if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2090 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2091
2092 /*
2093 * Initialize these tunables once.
2094 */
2095 sc->hn_agg_size = hn_tx_agg_size;
2096 sc->hn_agg_pkts = hn_tx_agg_pkts;
2097
2098 /*
2099 * Setup taskqueue for transmission.
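 * (hn_tx_taskq_mode selects the model: HN_TX_TASKQ_M_INDEP creates
 * per-device taskqueues below, HN_TX_TASKQ_M_GLOBAL reuses the shared
 * hn_tx_taskque array, and other modes leave hn_tx_taskqs NULL.)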
2100 */
2101 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2102 int i;
2103
2104 sc->hn_tx_taskqs =
2105 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2106 M_DEVBUF, M_WAITOK);
2107 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2108 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2109 M_WAITOK, taskqueue_thread_enqueue,
2110 &sc->hn_tx_taskqs[i]);
2111 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2112 "%s tx%d", device_get_nameunit(dev), i);
2113 }
2114 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2115 sc->hn_tx_taskqs = hn_tx_taskque;
2116 }
2117
2118 /*
2119 * Setup taskqueue for management tasks, e.g. link status.
2120 */
2121 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2122 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2123 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2124 device_get_nameunit(dev));
2125 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2126 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2127 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2128 hn_netchg_status_taskfunc, sc);
2129
2130 if (hn_xpnt_vf) {
2131 /*
2132 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
2133 */
2134 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2135 taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2136 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2137 device_get_nameunit(dev));
2138 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2139 hn_xpnt_vf_init_taskfunc, sc);
2140 }
2141
2142 /*
2143 * Allocate ifnet and setup its name early, so that if_printf
2144 * can be used by functions that will be called after
2145 * ether_ifattach().
2146 */
2147 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
2148 ifp->if_softc = sc;
2149 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2150
2151 /*
2152 * Initialize ifmedia early so that it can be unconditionally
2153 * destroyed if an error happens later on.
2154 */
2155 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2156
2157 /*
2158 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2159 * to use (tx_ring_cnt).
2160 *
2161 * NOTE:
2162 * The # of RX rings to use is the same as the # of channels to use.
2163 */
2164 ring_cnt = hn_chan_cnt;
2165 if (ring_cnt <= 0) {
2166 /* Default */
2167 ring_cnt = mp_ncpus;
2168 if (ring_cnt > HN_RING_CNT_DEF_MAX)
2169 ring_cnt = HN_RING_CNT_DEF_MAX;
2170 } else if (ring_cnt > mp_ncpus) {
2171 ring_cnt = mp_ncpus;
2172 }
2173
2174 tx_ring_cnt = hn_tx_ring_cnt;
2175 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2176 tx_ring_cnt = ring_cnt;
2177#ifdef HN_IFSTART_SUPPORT
2178 if (hn_use_if_start) {
2179 /* ifnet.if_start only needs one TX ring. */
2180 tx_ring_cnt = 1;
2181 }
2182#endif
2183
2184 /*
2185 * Set the leader CPU for channels.
2186 */
2187 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2188
2189 /*
2190 * Create enough TX/RX rings, even if only a limited number of
2191 * channels can be allocated.
2192 */
2193 error = hn_create_tx_data(sc, tx_ring_cnt);
2194 if (error)
2195 goto failed;
2196 error = hn_create_rx_data(sc, ring_cnt);
2197 if (error)
2198 goto failed;
2199
2200 /*
2201 * Create transaction context for NVS and RNDIS transactions.
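 * (The xact context is what pairs our NVS/RNDIS requests with the
 * host's responses; HN_XACT_REQ_SIZE and HN_XACT_RESP_SIZE presumably
 * bound the largest such exchange.)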
2202 */ 2203 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), 2204 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); 2205 if (sc->hn_xact == NULL) { 2206 error = ENXIO; 2207 goto failed; 2208 } 2209 2210 /* 2211 * Install orphan handler for the revocation of this device's 2212 * primary channel. 2213 * 2214 * NOTE: 2215 * The processing order is critical here: 2216 * Install the orphan handler, _before_ testing whether this 2217 * device's primary channel has been revoked or not. 2218 */ 2219 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); 2220 if (vmbus_chan_is_revoked(sc->hn_prichan)) { 2221 error = ENXIO; 2222 goto failed; 2223 } 2224 2225 /* 2226 * Attach the synthetic parts, i.e. NVS and RNDIS. 2227 */ 2228 error = hn_synth_attach(sc, ETHERMTU); 2229 if (error) 2230 goto failed; 2231 2232 error = hn_rndis_get_eaddr(sc, eaddr); 2233 if (error) 2234 goto failed; 2235 2236 error = hn_rndis_get_mtu(sc, &mtu); 2237 if (error) 2238 mtu = ETHERMTU; 2239 else if (bootverbose) 2240 device_printf(dev, "RNDIS mtu %u\n", mtu); 2241 2242#if __FreeBSD_version >= 1100099 2243 if (sc->hn_rx_ring_inuse > 1) { 2244 /* 2245 * Reduce TCP segment aggregation limit for multiple 2246 * RX rings to increase ACK timeliness. 2247 */ 2248 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); 2249 } 2250#endif 2251 2252 /* 2253 * Fixup TX/RX stuffs after synthetic parts are attached. 2254 */ 2255 hn_fixup_tx_data(sc); 2256 hn_fixup_rx_data(sc); 2257 2258 ctx = device_get_sysctl_ctx(dev); 2259 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 2260 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, 2261 &sc->hn_nvs_ver, 0, "NVS version"); 2262 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", 2263 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2264 hn_ndis_version_sysctl, "A", "NDIS version"); 2265 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", 2266 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2267 hn_caps_sysctl, "A", "capabilities"); 2268 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", 2269 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2270 hn_hwassist_sysctl, "A", "hwassist"); 2271 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", 2272 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); 2273 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", 2274 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, 2275 "max # of TSO segments"); 2276 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", 2277 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, 2278 "max size of TSO segment"); 2279 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", 2280 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2281 hn_rxfilter_sysctl, "A", "rxfilter"); 2282 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", 2283 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2284 hn_rss_hash_sysctl, "A", "RSS hash"); 2285 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", 2286 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2287 hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); 2288 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", 2289 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2290 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); 2291 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", 2292 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); 2293 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", 2294 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2295 hn_rss_key_sysctl, "IU", "RSS key"); 2296 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", 2297 CTLTYPE_OPAQUE | CTLFLAG_RW | 
CTLFLAG_MPSAFE, sc, 0, 2298 hn_rss_ind_sysctl, "IU", "RSS indirect table"); 2299 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", 2300 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, 2301 "RNDIS offered packet transmission aggregation size limit"); 2302 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", 2303 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, 2304 "RNDIS offered packet transmission aggregation count limit"); 2305 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", 2306 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, 2307 "RNDIS packet transmission aggregation alignment"); 2308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", 2309 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2310 hn_txagg_size_sysctl, "I", 2311 "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); 2312 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", 2313 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2314 hn_txagg_pkts_sysctl, "I", 2315 "Packet transmission aggregation packets, " 2316 "0 -- disable, -1 -- auto"); 2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", 2318 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2319 hn_polling_sysctl, "I", 2320 "Polling frequency: [100,1000000], 0 disable polling"); 2321 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", 2322 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2323 hn_vf_sysctl, "A", "Virtual Function's name"); 2324 if (!hn_xpnt_vf) { 2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", 2326 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2327 hn_rxvf_sysctl, "A", "activated Virtual Function's name"); 2328 } else { 2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", 2330 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 2331 hn_xpnt_vf_enabled_sysctl, "I", 2332 "Transparent VF enabled"); 2333 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", 2334 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 2335 hn_xpnt_vf_accbpf_sysctl, "I", 2336 "Accurate BPF for transparent VF"); 2337 } 2338 2339 /* 2340 * Setup the ifmedia, which has been initialized earlier. 2341 */ 2342 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); 2343 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); 2344 /* XXX ifmedia_set really should do this for us */ 2345 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; 2346 2347 /* 2348 * Setup the ifnet for this interface. 2349 */ 2350 2351#ifdef __LP64__ 2352 ifp->if_baudrate = IF_Gbps(10); 2353#else 2354 /* if_baudrate is 32bits on 32bit system. */ 2355 ifp->if_baudrate = IF_Gbps(1); 2356#endif 2357 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; 2358 ifp->if_ioctl = hn_ioctl; 2359 ifp->if_init = hn_init; 2360#ifdef HN_IFSTART_SUPPORT 2361 if (hn_use_if_start) { 2362 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); 2363 2364 ifp->if_start = hn_start; 2365 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); 2366 ifp->if_snd.ifq_drv_maxlen = qdepth - 1; 2367 IFQ_SET_READY(&ifp->if_snd); 2368 } else 2369#endif 2370 { 2371 ifp->if_transmit = hn_transmit; 2372 ifp->if_qflush = hn_xmit_qflush; 2373 } 2374 2375 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; 2376#ifdef foo 2377 /* We can't diff IPv6 packets from IPv4 packets on RX path. */ 2378 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; 2379#endif 2380 if (sc->hn_caps & HN_CAP_VLAN) { 2381 /* XXX not sure about VLAN_MTU. 
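 * (IFCAP_VLAN_MTU only advertises that full-sized frames are still
 * accepted when a VLAN tag is present; it is assumed the host data
 * path copes with the extra 4 bytes.)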
*/ 2382 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; 2383 } 2384 2385 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; 2386 if (ifp->if_hwassist & HN_CSUM_IP_MASK) 2387 ifp->if_capabilities |= IFCAP_TXCSUM; 2388 if (ifp->if_hwassist & HN_CSUM_IP6_MASK) 2389 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; 2390 if (sc->hn_caps & HN_CAP_TSO4) { 2391 ifp->if_capabilities |= IFCAP_TSO4; 2392 ifp->if_hwassist |= CSUM_IP_TSO; 2393 } 2394 if (sc->hn_caps & HN_CAP_TSO6) { 2395 ifp->if_capabilities |= IFCAP_TSO6; 2396 ifp->if_hwassist |= CSUM_IP6_TSO; 2397 } 2398 2399 /* Enable all available capabilities by default. */ 2400 ifp->if_capenable = ifp->if_capabilities; 2401 2402 /* 2403 * Disable IPv6 TSO and TXCSUM by default, they still can 2404 * be enabled through SIOCSIFCAP. 2405 */ 2406 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); 2407 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); 2408 2409 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { 2410 /* 2411 * Lock hn_set_tso_maxsize() to simplify its 2412 * internal logic. 2413 */ 2414 HN_LOCK(sc); 2415 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); 2416 HN_UNLOCK(sc); 2417 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; 2418 ifp->if_hw_tsomaxsegsize = PAGE_SIZE; 2419 } 2420 2421 ether_ifattach(ifp, eaddr); 2422 2423 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { 2424 if_printf(ifp, "TSO segcnt %u segsz %u\n", 2425 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); 2426 } 2427 if (mtu < ETHERMTU) { 2428 if_printf(ifp, "fixup mtu %lu -> %u\n", ifp->if_mtu, mtu); 2429 ifp->if_mtu = mtu; 2430 } 2431 2432 /* Inform the upper layer about the long frame support. */ 2433 ifp->if_hdrlen = sizeof(struct ether_vlan_header); 2434 2435 /* 2436 * Kick off link status check. 2437 */ 2438 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; 2439 hn_update_link_status(sc); 2440 2441 if (!hn_xpnt_vf) { 2442 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, 2443 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); 2444 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, 2445 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); 2446 } else { 2447 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, 2448 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); 2449 } 2450 2451 /* 2452 * NOTE: 2453 * Subscribe ether_ifattach event, instead of ifnet_arrival event, 2454 * since interface's LLADDR is needed; interface LLADDR is not 2455 * available when ifnet_arrival event is triggered. 2456 */ 2457 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, 2458 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); 2459 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, 2460 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); 2461 2462 return (0); 2463failed: 2464 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) 2465 hn_synth_detach(sc); 2466 hn_detach(dev); 2467 return (error); 2468} 2469 2470static int 2471hn_detach(device_t dev) 2472{ 2473 struct hn_softc *sc = device_get_softc(dev); 2474 struct ifnet *ifp = sc->hn_ifp, *vf_ifp; 2475 2476 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { 2477 /* 2478 * In case that the vmbus missed the orphan handler 2479 * installation. 
2480 */
2481 vmbus_xact_ctx_orphan(sc->hn_xact);
2482 }
2483
2484 if (sc->hn_ifaddr_evthand != NULL)
2485 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2486 if (sc->hn_ifnet_evthand != NULL)
2487 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2488 if (sc->hn_ifnet_atthand != NULL) {
2489 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2490 sc->hn_ifnet_atthand);
2491 }
2492 if (sc->hn_ifnet_dethand != NULL) {
2493 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2494 sc->hn_ifnet_dethand);
2495 }
2496 if (sc->hn_ifnet_lnkhand != NULL)
2497 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2498
2499 vf_ifp = sc->hn_vf_ifp;
2500 __compiler_membar();
2501 if (vf_ifp != NULL)
2502 hn_ifnet_detevent(sc, vf_ifp);
2503
2504 if (device_is_attached(dev)) {
2505 HN_LOCK(sc);
2506 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2507 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2508 hn_stop(sc, true);
2509 /*
2510 * NOTE:
2511 * hn_stop() only suspends data, so management
2512 * tasks have to be suspended manually here.
2513 */
2514 hn_suspend_mgmt(sc);
2515 hn_synth_detach(sc);
2516 }
2517 HN_UNLOCK(sc);
2518 ether_ifdetach(ifp);
2519 }
2520
2521 ifmedia_removeall(&sc->hn_media);
2522 hn_destroy_rx_data(sc);
2523 hn_destroy_tx_data(sc);
2524
2525 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2526 int i;
2527
2528 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2529 taskqueue_free(sc->hn_tx_taskqs[i]);
2530 free(sc->hn_tx_taskqs, M_DEVBUF);
2531 }
2532 taskqueue_free(sc->hn_mgmt_taskq0);
2533 if (sc->hn_vf_taskq != NULL)
2534 taskqueue_free(sc->hn_vf_taskq);
2535
2536 if (sc->hn_xact != NULL) {
2537 /*
2538 * Uninstall the orphan handler _before_ the xact is
2539 * destroyed.
2540 */
2541 vmbus_chan_unset_orphan(sc->hn_prichan);
2542 vmbus_xact_ctx_destroy(sc->hn_xact);
2543 }
2544
2545 if_free(ifp);
2546
2547 HN_LOCK_DESTROY(sc);
2548 rm_destroy(&sc->hn_vf_lock);
2549 return (0);
2550}
2551
2552static int
2553hn_shutdown(device_t dev)
2554{
2555
2556 return (0);
2557}
2558
2559static void
2560hn_link_status(struct hn_softc *sc)
2561{
2562 uint32_t link_status;
2563 int error;
2564
2565 error = hn_rndis_get_linkstatus(sc, &link_status);
2566 if (error) {
2567 /* XXX what to do? */
2568 return;
2569 }
2570
2571 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2572 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2573 else
2574 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2575 if_link_state_change(sc->hn_ifp,
2576 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2577 LINK_STATE_UP : LINK_STATE_DOWN);
2578}
2579
2580static void
2581hn_link_taskfunc(void *xsc, int pending __unused)
2582{
2583 struct hn_softc *sc = xsc;
2584
2585 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2586 return;
2587 hn_link_status(sc);
2588}
2589
2590static void
2591hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2592{
2593 struct hn_softc *sc = xsc;
2594
2595 /* Prevent any link status checks from running. */
2596 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2597
2598 /*
2599 * Fake up a [link down --> link up] state change; a 5 second
2600 * delay is used, which closely simulates the miibus reaction
2601 * to a link down event.
2602 */
2603 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2604 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2605 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2606 &sc->hn_netchg_status, 5 * hz);
2607}
2608
2609static void
2610hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2611{
2612 struct hn_softc *sc = xsc;
2613
2614 /* Re-allow link status checks.
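 * This runs 5 * hz ticks after hn_netchg_init_taskfunc() faked the
 * link-down event, completing the [down --> up] cycle.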
*/
2615 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2616 hn_link_status(sc);
2617}
2618
2619static void
2620hn_update_link_status(struct hn_softc *sc)
2621{
2622
2623 if (sc->hn_mgmt_taskq != NULL)
2624 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2625}
2626
2627static void
2628hn_change_network(struct hn_softc *sc)
2629{
2630
2631 if (sc->hn_mgmt_taskq != NULL)
2632 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2633}
2634
2635static __inline int
2636hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2637 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2638{
2639 struct mbuf *m = *m_head;
2640 int error;
2641
2642 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2643
2644 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2645 m, segs, nsegs, BUS_DMA_NOWAIT);
2646 if (error == EFBIG) {
2647 struct mbuf *m_new;
2648
2649 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2650 if (m_new == NULL)
2651 return ENOBUFS;
2652 else
2653 *m_head = m = m_new;
2654 txr->hn_tx_collapsed++;
2655
2656 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2657 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2658 }
2659 if (!error) {
2660 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2661 BUS_DMASYNC_PREWRITE);
2662 txd->flags |= HN_TXD_FLAG_DMAMAP;
2663 }
2664 return error;
2665}
2666
2667static __inline int
2668hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2669{
2670
2671 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2672 ("put an onlist txd %#x", txd->flags));
2673 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2674 ("put an onagg txd %#x", txd->flags));
2675
2676 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2677 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2678 return 0;
2679
2680 if (!STAILQ_EMPTY(&txd->agg_list)) {
2681 struct hn_txdesc *tmp_txd;
2682
2683 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2684 int freed;
2685
2686 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2687 ("recursive aggregation on aggregated txdesc"));
2688 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2689 ("not aggregated txdesc"));
2690 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2691 ("aggregated txdesc uses dmamap"));
2692 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2693 ("aggregated txdesc consumes "
2694 "chimney sending buffer"));
2695 KASSERT(tmp_txd->chim_size == 0,
2696 ("aggregated txdesc has non-zero "
2697 "chimney sending size"));
2698
2699 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2700 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2701 freed = hn_txdesc_put(txr, tmp_txd);
2702 KASSERT(freed, ("failed to free aggregated txdesc"));
2703 }
2704 }
2705
2706 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2707 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2708 ("chim txd uses dmamap"));
2709 hn_chim_free(txr->hn_sc, txd->chim_index);
2710 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2711 txd->chim_size = 0;
2712 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2713 bus_dmamap_sync(txr->hn_tx_data_dtag,
2714 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2715 bus_dmamap_unload(txr->hn_tx_data_dtag,
2716 txd->data_dmap);
2717 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2718 }
2719
2720 if (txd->m != NULL) {
2721 m_freem(txd->m);
2722 txd->m = NULL;
2723 }
2724
2725 txd->flags |= HN_TXD_FLAG_ONLIST;
2726#ifndef HN_USE_TXDESC_BUFRING
2727 mtx_lock_spin(&txr->hn_txlist_spin);
2728 KASSERT(txr->hn_txdesc_avail >= 0 &&
2729 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2730
("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); 2731 txr->hn_txdesc_avail++; 2732 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 2733 mtx_unlock_spin(&txr->hn_txlist_spin); 2734#else /* HN_USE_TXDESC_BUFRING */ 2735#ifdef HN_DEBUG 2736 atomic_add_int(&txr->hn_txdesc_avail, 1); 2737#endif 2738 buf_ring_enqueue(txr->hn_txdesc_br, txd); 2739#endif /* !HN_USE_TXDESC_BUFRING */ 2740 2741 return 1; 2742} 2743 2744static __inline struct hn_txdesc * 2745hn_txdesc_get(struct hn_tx_ring *txr) 2746{ 2747 struct hn_txdesc *txd; 2748 2749#ifndef HN_USE_TXDESC_BUFRING 2750 mtx_lock_spin(&txr->hn_txlist_spin); 2751 txd = SLIST_FIRST(&txr->hn_txlist); 2752 if (txd != NULL) { 2753 KASSERT(txr->hn_txdesc_avail > 0, 2754 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); 2755 txr->hn_txdesc_avail--; 2756 SLIST_REMOVE_HEAD(&txr->hn_txlist, link); 2757 } 2758 mtx_unlock_spin(&txr->hn_txlist_spin); 2759#else 2760 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); 2761#endif 2762 2763 if (txd != NULL) { 2764#ifdef HN_USE_TXDESC_BUFRING 2765#ifdef HN_DEBUG 2766 atomic_subtract_int(&txr->hn_txdesc_avail, 1); 2767#endif 2768#endif /* HN_USE_TXDESC_BUFRING */ 2769 KASSERT(txd->m == NULL && txd->refs == 0 && 2770 STAILQ_EMPTY(&txd->agg_list) && 2771 txd->chim_index == HN_NVS_CHIM_IDX_INVALID && 2772 txd->chim_size == 0 && 2773 (txd->flags & HN_TXD_FLAG_ONLIST) && 2774 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && 2775 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); 2776 txd->flags &= ~HN_TXD_FLAG_ONLIST; 2777 txd->refs = 1; 2778 } 2779 return txd; 2780} 2781 2782static __inline void 2783hn_txdesc_hold(struct hn_txdesc *txd) 2784{ 2785 2786 /* 0->1 transition will never work */ 2787 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); 2788 atomic_add_int(&txd->refs, 1); 2789} 2790 2791static __inline void 2792hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) 2793{ 2794 2795 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2796 ("recursive aggregation on aggregating txdesc")); 2797 2798 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, 2799 ("already aggregated")); 2800 KASSERT(STAILQ_EMPTY(&txd->agg_list), 2801 ("recursive aggregation on to-be-aggregated txdesc")); 2802 2803 txd->flags |= HN_TXD_FLAG_ONAGG; 2804 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); 2805} 2806 2807static bool 2808hn_tx_ring_pending(struct hn_tx_ring *txr) 2809{ 2810 bool pending = false; 2811 2812#ifndef HN_USE_TXDESC_BUFRING 2813 mtx_lock_spin(&txr->hn_txlist_spin); 2814 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) 2815 pending = true; 2816 mtx_unlock_spin(&txr->hn_txlist_spin); 2817#else 2818 if (!buf_ring_full(txr->hn_txdesc_br)) 2819 pending = true; 2820#endif 2821 return (pending); 2822} 2823 2824static __inline void 2825hn_txeof(struct hn_tx_ring *txr) 2826{ 2827 txr->hn_has_txeof = 0; 2828 txr->hn_txeof(txr); 2829} 2830 2831static void 2832hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, 2833 struct vmbus_channel *chan, const void *data __unused, int dlen __unused) 2834{ 2835 struct hn_txdesc *txd = sndc->hn_cbarg; 2836 struct hn_tx_ring *txr; 2837 2838 txr = txd->txr; 2839 KASSERT(txr->hn_chan == chan, 2840 ("channel mismatch, on chan%u, should be chan%u", 2841 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); 2842 2843 txr->hn_has_txeof = 1; 2844 hn_txdesc_put(txr, txd); 2845 2846 ++txr->hn_txdone_cnt; 2847 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { 2848 txr->hn_txdone_cnt = 0; 2849 if (txr->hn_oactive) 2850 hn_txeof(txr); 2851 } 2852} 2853 2854static void 
2855hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) 2856{ 2857#if defined(INET) || defined(INET6) 2858 struct lro_ctrl *lro = &rxr->hn_lro; 2859 struct lro_entry *queued; 2860 2861 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { 2862 SLIST_REMOVE_HEAD(&lro->lro_active, next); 2863 tcp_lro_flush(lro, queued); 2864 } 2865#endif 2866 2867 /* 2868 * NOTE: 2869 * 'txr' could be NULL, if multiple channels and 2870 * ifnet.if_start method are enabled. 2871 */ 2872 if (txr == NULL || !txr->hn_has_txeof) 2873 return; 2874 2875 txr->hn_txdone_cnt = 0; 2876 hn_txeof(txr); 2877} 2878 2879static __inline uint32_t 2880hn_rndis_pktmsg_offset(uint32_t ofs) 2881{ 2882 2883 KASSERT(ofs >= sizeof(struct rndis_packet_msg), 2884 ("invalid RNDIS packet msg offset %u", ofs)); 2885 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); 2886} 2887 2888static __inline void * 2889hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, 2890 size_t pi_dlen, uint32_t pi_type) 2891{ 2892 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); 2893 struct rndis_pktinfo *pi; 2894 2895 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, 2896 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); 2897 2898 /* 2899 * Per-packet-info does not move; it only grows. 2900 * 2901 * NOTE: 2902 * rm_pktinfooffset in this phase counts from the beginning 2903 * of rndis_packet_msg. 2904 */ 2905 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, 2906 ("%u pktinfo overflows RNDIS packet msg", pi_type)); 2907 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + 2908 pkt->rm_pktinfolen); 2909 pkt->rm_pktinfolen += pi_size; 2910 2911 pi->rm_size = pi_size; 2912 pi->rm_type = pi_type; 2913 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; 2914 2915 return (pi->rm_data); 2916} 2917 2918static __inline int 2919hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) 2920{ 2921 struct hn_txdesc *txd; 2922 struct mbuf *m; 2923 int error, pkts; 2924 2925 txd = txr->hn_agg_txd; 2926 KASSERT(txd != NULL, ("no aggregate txdesc")); 2927 2928 /* 2929 * Since hn_txpkt() will reset this temporary stat, save 2930 * it now, so that oerrors can be updated properly, if 2931 * hn_txpkt() ever fails. 2932 */ 2933 pkts = txr->hn_stat_pkts; 2934 2935 /* 2936 * Since txd's mbuf will _not_ be freed upon hn_txpkt() 2937 * failure, save it for later freeing, if hn_txpkt() ever 2938 * fails. 2939 */ 2940 m = txd->m; 2941 error = hn_txpkt(ifp, txr, txd); 2942 if (__predict_false(error)) { 2943 /* txd is freed, but m is not. */ 2944 m_freem(m); 2945 2946 txr->hn_flush_failed++; 2947 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); 2948 } 2949 2950 /* Reset all aggregation states. */ 2951 txr->hn_agg_txd = NULL; 2952 txr->hn_agg_szleft = 0; 2953 txr->hn_agg_pktleft = 0; 2954 txr->hn_agg_prevpkt = NULL; 2955 2956 return (error); 2957} 2958 2959static void * 2960hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 2961 int pktsize) 2962{ 2963 void *chim; 2964 2965 if (txr->hn_agg_txd != NULL) { 2966 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { 2967 struct hn_txdesc *agg_txd = txr->hn_agg_txd; 2968 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; 2969 int olen; 2970 2971 /* 2972 * Update the previous RNDIS packet's total length, 2973 * it can be increased due to the mandatory alignment 2974 * padding for this RNDIS packet. And update the 2975 * aggregating txdesc's chimney sending buffer size 2976 * accordingly. 
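 * For example, assuming an 8-byte aggregation alignment, a previous
 * packet with rm_len 61 is padded to roundup2(61, 8) == 64, and the
 * 3 padding bytes are added to the aggregating txdesc's chim_size.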
2977 * 2978 * XXX 2979 * Zero-out the padding, as required by the RNDIS spec. 2980 */ 2981 olen = pkt->rm_len; 2982 pkt->rm_len = roundup2(olen, txr->hn_agg_align); 2983 agg_txd->chim_size += pkt->rm_len - olen; 2984 2985 /* Link this txdesc to the parent. */ 2986 hn_txdesc_agg(agg_txd, txd); 2987 2988 chim = (uint8_t *)pkt + pkt->rm_len; 2989 /* Save the current packet for later fixup. */ 2990 txr->hn_agg_prevpkt = chim; 2991 2992 txr->hn_agg_pktleft--; 2993 txr->hn_agg_szleft -= pktsize; 2994 if (txr->hn_agg_szleft <= 2995 HN_PKTSIZE_MIN(txr->hn_agg_align)) { 2996 /* 2997 * Probably can't aggregate more packets, 2998 * flush this aggregating txdesc proactively. 2999 */ 3000 txr->hn_agg_pktleft = 0; 3001 } 3002 /* Done! */ 3003 return (chim); 3004 } 3005 hn_flush_txagg(ifp, txr); 3006 } 3007 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 3008 3009 txr->hn_tx_chimney_tried++; 3010 txd->chim_index = hn_chim_alloc(txr->hn_sc); 3011 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) 3012 return (NULL); 3013 txr->hn_tx_chimney++; 3014 3015 chim = txr->hn_sc->hn_chim + 3016 (txd->chim_index * txr->hn_sc->hn_chim_szmax); 3017 3018 if (txr->hn_agg_pktmax > 1 && 3019 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { 3020 txr->hn_agg_txd = txd; 3021 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; 3022 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; 3023 txr->hn_agg_prevpkt = chim; 3024 } 3025 return (chim); 3026} 3027 3028/* 3029 * NOTE: 3030 * If this function fails, then both txd and m_head0 will be freed. 3031 */ 3032static int 3033hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, 3034 struct mbuf **m_head0) 3035{ 3036 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; 3037 int error, nsegs, i; 3038 struct mbuf *m_head = *m_head0; 3039 struct rndis_packet_msg *pkt; 3040 uint32_t *pi_data; 3041 void *chim = NULL; 3042 int pkt_hlen, pkt_size; 3043 3044 pkt = txd->rndis_pkt; 3045 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); 3046 if (pkt_size < txr->hn_chim_size) { 3047 chim = hn_try_txagg(ifp, txr, txd, pkt_size); 3048 if (chim != NULL) 3049 pkt = chim; 3050 } else { 3051 if (txr->hn_agg_txd != NULL) 3052 hn_flush_txagg(ifp, txr); 3053 } 3054 3055 pkt->rm_type = REMOTE_NDIS_PACKET_MSG; 3056 pkt->rm_len = m_head->m_pkthdr.len; 3057 pkt->rm_dataoffset = 0; 3058 pkt->rm_datalen = m_head->m_pkthdr.len; 3059 pkt->rm_oobdataoffset = 0; 3060 pkt->rm_oobdatalen = 0; 3061 pkt->rm_oobdataelements = 0; 3062 pkt->rm_pktinfooffset = sizeof(*pkt); 3063 pkt->rm_pktinfolen = 0; 3064 pkt->rm_vchandle = 0; 3065 pkt->rm_reserved = 0; 3066 3067 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { 3068 /* 3069 * Set the hash value for this packet, so that the host could 3070 * dispatch the TX done event for this packet back to this TX 3071 * ring's channel. 
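 * (This keeps TX completion processing on the channel, and hence the
 * CPU, that sent the packet.)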
3072 */ 3073 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3074 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); 3075 *pi_data = txr->hn_tx_idx; 3076 } 3077 3078 if (m_head->m_flags & M_VLANTAG) { 3079 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3080 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); 3081 *pi_data = NDIS_VLAN_INFO_MAKE( 3082 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), 3083 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), 3084 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); 3085 } 3086 3087 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 3088#if defined(INET6) || defined(INET) 3089 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3090 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); 3091#ifdef INET 3092 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { 3093 *pi_data = NDIS_LSO2_INFO_MAKEIPV4( 3094 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3095 m_head->m_pkthdr.tso_segsz); 3096 } 3097#endif 3098#if defined(INET6) && defined(INET) 3099 else 3100#endif 3101#ifdef INET6 3102 { 3103 *pi_data = NDIS_LSO2_INFO_MAKEIPV6( 3104 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, 3105 m_head->m_pkthdr.tso_segsz); 3106 } 3107#endif 3108#endif /* INET6 || INET */ 3109 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { 3110 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, 3111 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); 3112 if (m_head->m_pkthdr.csum_flags & 3113 (CSUM_IP6_TCP | CSUM_IP6_UDP)) { 3114 *pi_data = NDIS_TXCSUM_INFO_IPV6; 3115 } else { 3116 *pi_data = NDIS_TXCSUM_INFO_IPV4; 3117 if (m_head->m_pkthdr.csum_flags & CSUM_IP) 3118 *pi_data |= NDIS_TXCSUM_INFO_IPCS; 3119 } 3120 3121 if (m_head->m_pkthdr.csum_flags & 3122 (CSUM_IP_TCP | CSUM_IP6_TCP)) { 3123 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( 3124 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3125 } else if (m_head->m_pkthdr.csum_flags & 3126 (CSUM_IP_UDP | CSUM_IP6_UDP)) { 3127 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( 3128 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); 3129 } 3130 } 3131 3132 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; 3133 /* Fixup RNDIS packet message total length */ 3134 pkt->rm_len += pkt_hlen; 3135 /* Convert RNDIS packet message offsets */ 3136 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); 3137 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); 3138 3139 /* 3140 * Fast path: Chimney sending. 3141 */ 3142 if (chim != NULL) { 3143 struct hn_txdesc *tgt_txd = txd; 3144 3145 if (txr->hn_agg_txd != NULL) { 3146 tgt_txd = txr->hn_agg_txd; 3147#ifdef INVARIANTS 3148 *m_head0 = NULL; 3149#endif 3150 } 3151 3152 KASSERT(pkt == chim, 3153 ("RNDIS pkt not in chimney sending buffer")); 3154 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, 3155 ("chimney sending buffer is not used")); 3156 tgt_txd->chim_size += pkt->rm_len; 3157 3158 m_copydata(m_head, 0, m_head->m_pkthdr.len, 3159 ((uint8_t *)chim) + pkt_hlen); 3160 3161 txr->hn_gpa_cnt = 0; 3162 txr->hn_sendpkt = hn_txpkt_chim; 3163 goto done; 3164 } 3165 3166 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); 3167 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, 3168 ("chimney buffer is used")); 3169 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); 3170 3171 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); 3172 if (__predict_false(error)) { 3173 int freed; 3174 3175 /* 3176 * This mbuf is not linked w/ the txd yet, so free it now. 
3177 */ 3178 m_freem(m_head); 3179 *m_head0 = NULL; 3180 3181 freed = hn_txdesc_put(txr, txd); 3182 KASSERT(freed != 0, 3183 ("fail to free txd upon txdma error")); 3184 3185 txr->hn_txdma_failed++; 3186 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 3187 return error; 3188 } 3189 *m_head0 = m_head; 3190 3191 /* +1 RNDIS packet message */ 3192 txr->hn_gpa_cnt = nsegs + 1; 3193 3194 /* send packet with page buffer */ 3195 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); 3196 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; 3197 txr->hn_gpa[0].gpa_len = pkt_hlen; 3198 3199 /* 3200 * Fill the page buffers with mbuf info after the page 3201 * buffer for RNDIS packet message. 3202 */ 3203 for (i = 0; i < nsegs; ++i) { 3204 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; 3205 3206 gpa->gpa_page = atop(segs[i].ds_addr); 3207 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; 3208 gpa->gpa_len = segs[i].ds_len; 3209 } 3210 3211 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 3212 txd->chim_size = 0; 3213 txr->hn_sendpkt = hn_txpkt_sglist; 3214done: 3215 txd->m = m_head; 3216 3217 /* Set the completion routine */ 3218 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); 3219 3220 /* Update temporary stats for later use. */ 3221 txr->hn_stat_pkts++; 3222 txr->hn_stat_size += m_head->m_pkthdr.len; 3223 if (m_head->m_flags & M_MCAST) 3224 txr->hn_stat_mcasts++; 3225 3226 return 0; 3227} 3228 3229/* 3230 * NOTE: 3231 * If this function fails, then txd will be freed, but the mbuf 3232 * associated w/ the txd will _not_ be freed. 3233 */ 3234static int 3235hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) 3236{ 3237 int error, send_failed = 0, has_bpf; 3238 3239again: 3240 has_bpf = bpf_peers_present(ifp->if_bpf); 3241 if (has_bpf) { 3242 /* 3243 * Make sure that this txd and any aggregated txds are not 3244 * freed before ETHER_BPF_MTAP. 3245 */ 3246 hn_txdesc_hold(txd); 3247 } 3248 error = txr->hn_sendpkt(txr, txd); 3249 if (!error) { 3250 if (has_bpf) { 3251 const struct hn_txdesc *tmp_txd; 3252 3253 ETHER_BPF_MTAP(ifp, txd->m); 3254 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) 3255 ETHER_BPF_MTAP(ifp, tmp_txd->m); 3256 } 3257 3258 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); 3259#ifdef HN_IFSTART_SUPPORT 3260 if (!hn_use_if_start) 3261#endif 3262 { 3263 if_inc_counter(ifp, IFCOUNTER_OBYTES, 3264 txr->hn_stat_size); 3265 if (txr->hn_stat_mcasts != 0) { 3266 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 3267 txr->hn_stat_mcasts); 3268 } 3269 } 3270 txr->hn_pkts += txr->hn_stat_pkts; 3271 txr->hn_sends++; 3272 } 3273 if (has_bpf) 3274 hn_txdesc_put(txr, txd); 3275 3276 if (__predict_false(error)) { 3277 int freed; 3278 3279 /* 3280 * This should "really rarely" happen. 3281 * 3282 * XXX Too many RX to be acked or too many sideband 3283 * commands to run? Ask netvsc_channel_rollup() 3284 * to kick start later. 3285 */ 3286 txr->hn_has_txeof = 1; 3287 if (!send_failed) { 3288 txr->hn_send_failed++; 3289 send_failed = 1; 3290 /* 3291 * Try sending again after set hn_has_txeof; 3292 * in case that we missed the last 3293 * netvsc_channel_rollup(). 3294 */ 3295 goto again; 3296 } 3297 if_printf(ifp, "send failed\n"); 3298 3299 /* 3300 * Caller will perform further processing on the 3301 * associated mbuf, so don't free it in hn_txdesc_put(); 3302 * only unload it from the DMA map in hn_txdesc_put(), 3303 * if it was loaded. 
3304 */
3305 txd->m = NULL;
3306 freed = hn_txdesc_put(txr, txd);
3307 KASSERT(freed != 0,
3308 ("fail to free txd upon send error"));
3309
3310 txr->hn_send_failed++;
3311 }
3312
3313 /* Reset temporary stats after this sending is done. */
3314 txr->hn_stat_size = 0;
3315 txr->hn_stat_pkts = 0;
3316 txr->hn_stat_mcasts = 0;
3317
3318 return (error);
3319}
3320
3321/*
3322 * Append the specified data to the indicated mbuf chain;
3323 * extend the mbuf chain if the new data does not fit in
3324 * existing space.
3325 *
3326 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3327 * There should be an equivalent in the kernel mbuf code,
3328 * but there does not appear to be one yet.
3329 *
3330 * Differs from m_append() in that additional mbufs are
3331 * allocated with cluster size MJUMPAGESIZE, and filled
3332 * accordingly.
3333 *
3334 * Return 1 if able to complete the job; otherwise 0.
3335 */
3336static int
3337hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3338{
3339 struct mbuf *m, *n;
3340 int remainder, space;
3341
3342 for (m = m0; m->m_next != NULL; m = m->m_next)
3343 ;
3344 remainder = len;
3345 space = M_TRAILINGSPACE(m);
3346 if (space > 0) {
3347 /*
3348 * Copy into available space.
3349 */
3350 if (space > remainder)
3351 space = remainder;
3352 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3353 m->m_len += space;
3354 cp += space;
3355 remainder -= space;
3356 }
3357 while (remainder > 0) {
3358 /*
3359 * Allocate a new mbuf; could check space
3360 * and allocate a cluster instead.
3361 */
3362 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
3363 if (n == NULL)
3364 break;
3365 n->m_len = min(MJUMPAGESIZE, remainder);
3366 bcopy(cp, mtod(n, caddr_t), n->m_len);
3367 cp += n->m_len;
3368 remainder -= n->m_len;
3369 m->m_next = n;
3370 m = n;
3371 }
3372 if (m0->m_flags & M_PKTHDR)
3373 m0->m_pkthdr.len += len - remainder;
3374
3375 return (remainder == 0);
3376}
3377
3378#if defined(INET) || defined(INET6)
3379static __inline int
3380hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3381{
3382#if __FreeBSD_version >= 1100095
3383 if (hn_lro_mbufq_depth) {
3384 tcp_lro_queue_mbuf(lc, m);
3385 return 0;
3386 }
3387#endif
3388 return tcp_lro_rx(lc, m, 0);
3389}
3390#endif
3391
3392static int
3393hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3394 const struct hn_rxinfo *info)
3395{
3396 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3397 struct mbuf *m_new;
3398 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3399 int hash_type = M_HASHTYPE_NONE;
3400 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3401
3402 ifp = hn_ifp;
3403 if (rxr->hn_rxvf_ifp != NULL) {
3404 /*
3405 * Non-transparent mode VF; pretend this packet is from
3406 * the VF.
3407 */
3408 ifp = rxr->hn_rxvf_ifp;
3409 is_vf = 1;
3410 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3411 /* Transparent mode VF. */
3412 is_vf = 1;
3413 }
3414
3415 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3416 /*
3417 * NOTE:
3418 * See the NOTE of hn_rndis_init_fixat(). This
3419 * function can be reached immediately after the
3420 * RNDIS is initialized but before the ifnet is
3421 * set up on the hn_attach() path; drop the unexpected
3422 * packets.
3423 */
3424 return (0);
3425 }
3426
3427 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3428 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3429 return (0);
3430 }
3431
3432 if (dlen <= MHLEN) {
3433 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3434 if (m_new == NULL) {
3435 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3436 return (0);
3437 }
3438 memcpy(mtod(m_new, void *), data, dlen);
3439 m_new->m_pkthdr.len = m_new->m_len = dlen;
3440 rxr->hn_small_pkts++;
3441 } else {
3442 /*
3443 * Get an mbuf with a cluster. For packets 2K or less,
3444 * get a standard 2K cluster. For anything larger, get a
3445 * 4K cluster. Any buffers larger than 4K can cause problems
3446 * if looped around to the Hyper-V TX channel, so avoid them.
3447 */
3448 size = MCLBYTES;
3449 if (dlen > MCLBYTES) {
3450 /* 4096 */
3451 size = MJUMPAGESIZE;
3452 }
3453
3454 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3455 if (m_new == NULL) {
3456 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3457 return (0);
3458 }
3459
3460 hv_m_append(m_new, dlen, data);
3461 }
3462 m_new->m_pkthdr.rcvif = ifp;
3463
3464 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3465 do_csum = 0;
3466
3467 /* receive side checksum offload */
3468 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3469 /* IP csum offload */
3470 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3471 m_new->m_pkthdr.csum_flags |=
3472 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3473 rxr->hn_csum_ip++;
3474 }
3475
3476 /* TCP/UDP csum offload */
3477 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3478 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3479 m_new->m_pkthdr.csum_flags |=
3480 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3481 m_new->m_pkthdr.csum_data = 0xffff;
3482 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3483 rxr->hn_csum_tcp++;
3484 else
3485 rxr->hn_csum_udp++;
3486 }
3487
3488 /*
3489 * XXX
3490 * As of this writing (Oct 28th, 2016), the host side turns
3491 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3492 * the do_lro setting here is actually _not_ accurate. We
3493 * depend on the RSS hash type check to reset do_lro.
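 * In other words, a UDP datagram may arrive with TCPCS_OK set, so
 * treating these bits as "TCP" for LRO purposes is only provisional
 * until the RSS hash type is inspected below.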
3494 */
3495 if ((info->csum_info &
3496 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3497 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3498 do_lro = 1;
3499 } else {
3500 hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3501 if (l3proto == ETHERTYPE_IP) {
3502 if (l4proto == IPPROTO_TCP) {
3503 if (do_csum &&
3504 (rxr->hn_trust_hcsum &
3505 HN_TRUST_HCSUM_TCP)) {
3506 rxr->hn_csum_trusted++;
3507 m_new->m_pkthdr.csum_flags |=
3508 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3509 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3510 m_new->m_pkthdr.csum_data = 0xffff;
3511 }
3512 do_lro = 1;
3513 } else if (l4proto == IPPROTO_UDP) {
3514 if (do_csum &&
3515 (rxr->hn_trust_hcsum &
3516 HN_TRUST_HCSUM_UDP)) {
3517 rxr->hn_csum_trusted++;
3518 m_new->m_pkthdr.csum_flags |=
3519 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3520 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3521 m_new->m_pkthdr.csum_data = 0xffff;
3522 }
3523 } else if (l4proto != IPPROTO_DONE && do_csum &&
3524 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3525 rxr->hn_csum_trusted++;
3526 m_new->m_pkthdr.csum_flags |=
3527 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3528 }
3529 }
3530 }
3531
3532 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3533 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3534 NDIS_VLAN_INFO_ID(info->vlan_info),
3535 NDIS_VLAN_INFO_PRI(info->vlan_info),
3536 NDIS_VLAN_INFO_CFI(info->vlan_info));
3537 m_new->m_flags |= M_VLANTAG;
3538 }
3539
3540 /*
3541 * If the VF is activated (transparent/non-transparent mode does
3542 * not matter here):
3543 *
3544 * - Disable LRO
3545 *
3546 * hn(4) will only receive broadcast packets, multicast packets,
3547 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3548 * packet types.
3549 *
3550 * For non-transparent mode, we definitely _cannot_ enable LRO at
3551 * all, since the LRO flush will use hn(4) as the receiving
3552 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3553 */
3554 if (is_vf)
3555 do_lro = 0;
3556
3557 /*
3558 * If the VF is activated (transparent/non-transparent mode does
3559 * not matter here), do _not_ mess with unsupported hash types or
3560 * functions.
3561 */
3562 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3563 rxr->hn_rss_pkts++;
3564 m_new->m_pkthdr.flowid = info->hash_value;
3565 if (!is_vf)
3566 hash_type = M_HASHTYPE_OPAQUE;
3567 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3568 NDIS_HASH_FUNCTION_TOEPLITZ) {
3569 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3570 rxr->hn_mbuf_hash);
3571
3572 /*
3573 * NOTE:
3574 * do_lro is reset if the hash type is not TCP
3575 * related. See the comment in the above csum_flags
3576 * setup section.
3577 */
3578 switch (type) {
3579 case NDIS_HASH_IPV4:
3580 hash_type = M_HASHTYPE_RSS_IPV4;
3581 do_lro = 0;
3582 break;
3583
3584 case NDIS_HASH_TCP_IPV4:
3585 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3586 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3587 int def_htype = M_HASHTYPE_OPAQUE;
3588
3589 if (is_vf)
3590 def_htype = M_HASHTYPE_NONE;
3591
3592 /*
3593 * UDP 4-tuple hash is delivered as
3594 * TCP 4-tuple hash.
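 * i.e. an inbound UDP datagram can carry NDIS_HASH_TCP_IPV4,
 * so the L4 protocol is re-checked below before the
 * hash type is trusted or LRO is attempted.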
3595 */
3596 if (l3proto == ETHERTYPE_MAX) {
3597 hn_rxpkt_proto(m_new,
3598 &l3proto, &l4proto);
3599 }
3600 if (l3proto == ETHERTYPE_IP) {
3601 if (l4proto == IPPROTO_UDP &&
3602 (rxr->hn_mbuf_hash &
3603 NDIS_HASH_UDP_IPV4_X)) {
3604 hash_type =
3605 M_HASHTYPE_RSS_UDP_IPV4;
3606 do_lro = 0;
3607 } else if (l4proto !=
3608 IPPROTO_TCP) {
3609 hash_type = def_htype;
3610 do_lro = 0;
3611 }
3612 } else {
3613 hash_type = def_htype;
3614 do_lro = 0;
3615 }
3616 }
3617 break;
3618
3619 case NDIS_HASH_IPV6:
3620 hash_type = M_HASHTYPE_RSS_IPV6;
3621 do_lro = 0;
3622 break;
3623
3624 case NDIS_HASH_IPV6_EX:
3625 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3626 do_lro = 0;
3627 break;
3628
3629 case NDIS_HASH_TCP_IPV6:
3630 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3631 break;
3632
3633 case NDIS_HASH_TCP_IPV6_EX:
3634 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3635 break;
3636 }
3637 }
3638 } else if (!is_vf) {
3639 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3640 }
3641 M_HASHTYPE_SET(m_new, hash_type);
3642
3643 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3644 if (hn_ifp != ifp) {
3645 const struct ether_header *eh;
3646
3647 /*
3648 * Non-transparent mode VF is activated.
3649 */
3650
3651 /*
3652 * Allow tapping on hn(4).
3653 */
3654 ETHER_BPF_MTAP(hn_ifp, m_new);
3655
3656 /*
3657 * Update hn(4)'s stats.
3658 */
3659 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3660 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3661 /* Checked at the beginning of this function. */
3662 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3663 eh = mtod(m_new, struct ether_header *);
3664 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3665 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3666 }
3667 rxr->hn_pkts++;
3668
3669 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3670#if defined(INET) || defined(INET6)
3671 struct lro_ctrl *lro = &rxr->hn_lro;
3672
3673 if (lro->lro_cnt) {
3674 rxr->hn_lro_tried++;
3675 if (hn_lro_rx(lro, m_new) == 0) {
3676 /* DONE! */
3677 return 0;
3678 }
3679 }
3680#endif
3681 }
3682 ifp->if_input(ifp, m_new);
3683
3684 return (0);
3685}
3686
3687static int
3688hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3689{
3690 struct hn_softc *sc = ifp->if_softc;
3691 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3692 struct ifnet *vf_ifp;
3693 int mask, error = 0;
3694 struct ifrsskey *ifrk;
3695 struct ifrsshash *ifrh;
3696 uint32_t mtu;
3697
3698 switch (cmd) {
3699 case SIOCSIFMTU:
3700 if (ifr->ifr_mtu > HN_MTU_MAX) {
3701 error = EINVAL;
3702 break;
3703 }
3704
3705 HN_LOCK(sc);
3706
3707 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3708 HN_UNLOCK(sc);
3709 break;
3710 }
3711
3712 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3713 /* Can't change MTU */
3714 HN_UNLOCK(sc);
3715 error = EOPNOTSUPP;
3716 break;
3717 }
3718
3719 if (ifp->if_mtu == ifr->ifr_mtu) {
3720 HN_UNLOCK(sc);
3721 break;
3722 }
3723
3724 if (hn_xpnt_vf_isready(sc)) {
3725 vf_ifp = sc->hn_vf_ifp;
3726 ifr_vf = *ifr;
3727 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3728 sizeof(ifr_vf.ifr_name));
3729 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3730 (caddr_t)&ifr_vf);
3731 if (error) {
3732 HN_UNLOCK(sc);
3733 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3734 vf_ifp->if_xname, ifr->ifr_mtu, error);
3735 break;
3736 }
3737 }
3738
3739 /*
3740 * Suspend this interface before the synthetic parts
3741 * are ripped out.
3742 */
3743 hn_suspend(sc);
3744
3745 /*
3746 * Detach the synthetic parts, i.e. NVS and RNDIS.
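 * The MTU is fixed when NVS/RNDIS attach, which is presumably why
 * an MTU change requires this full detach/reattach cycle.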
3747 */
3748 hn_synth_detach(sc);
3749
3750 /*
3751 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3752 * with the new MTU setting.
3753 */
3754 error = hn_synth_attach(sc, ifr->ifr_mtu);
3755 if (error) {
3756 HN_UNLOCK(sc);
3757 break;
3758 }
3759
3760 error = hn_rndis_get_mtu(sc, &mtu);
3761 if (error)
3762 mtu = ifr->ifr_mtu;
3763 else if (bootverbose)
3764 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3765
3766 /*
3767 * Commit the requested MTU, after the synthetic parts
3768 * have been successfully attached.
3769 */
3770 if (mtu >= ifr->ifr_mtu) {
3771 mtu = ifr->ifr_mtu;
3772 } else {
3773 if_printf(ifp, "fixup mtu %d -> %u\n",
3774 ifr->ifr_mtu, mtu);
3775 }
3776 ifp->if_mtu = mtu;
3777
3778 /*
3779 * Synthetic parts' reattach may change the chimney
3780 * sending size; update it.
3781 */
3782 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3783 hn_set_chim_size(sc, sc->hn_chim_szmax);
3784
3785 /*
3786 * Make sure that various parameters based on MTU are
3787 * still valid, after the MTU change.
3788 */
3789 hn_mtu_change_fixup(sc);
3790
3791 /*
3792 * All done! Resume the interface now.
3793 */
3794 hn_resume(sc);
3795
3796 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3797 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3798 /*
3799 * Since we have reattached the NVS part,
3800 * change the datapath to the VF again, in
3801 * case it was lost when the NVS was detached.
3802 */
3803 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3804 }
3805
3806 HN_UNLOCK(sc);
3807 break;
3808
3809 case SIOCSIFFLAGS:
3810 HN_LOCK(sc);
3811
3812 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3813 HN_UNLOCK(sc);
3814 break;
3815 }
3816
3817 if (hn_xpnt_vf_isready(sc))
3818 hn_xpnt_vf_saveifflags(sc);
3819
3820 if (ifp->if_flags & IFF_UP) {
3821 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3822 /*
3823 * Caller might hold a mutex, e.g.
3824 * bpf; use busy-wait for the RNDIS
3825 * reply.
3826 */
3827 HN_NO_SLEEPING(sc);
3828 hn_rxfilter_config(sc);
3829 HN_SLEEPING_OK(sc);
3830
3831 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3832 error = hn_xpnt_vf_iocsetflags(sc);
3833 } else {
3834 hn_init_locked(sc);
3835 }
3836 } else {
3837 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3838 hn_stop(sc, false);
3839 }
3840 sc->hn_if_flags = ifp->if_flags;
3841
3842 HN_UNLOCK(sc);
3843 break;
3844
3845 case SIOCSIFCAP:
3846 HN_LOCK(sc);
3847
3848 if (hn_xpnt_vf_isready(sc)) {
3849 ifr_vf = *ifr;
3850 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3851 sizeof(ifr_vf.ifr_name));
3852 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3853 HN_UNLOCK(sc);
3854 break;
3855 }
3856
3857 /*
3858 * Fix up requested capabilities w/ supported capabilities,
3859 * since the supported capabilities could have been changed.
3860 */
3861 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3862 ifp->if_capenable;
3863
3864 if (mask & IFCAP_TXCSUM) {
3865 ifp->if_capenable ^= IFCAP_TXCSUM;
3866 if (ifp->if_capenable & IFCAP_TXCSUM)
3867 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3868 else
3869 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3870 }
3871 if (mask & IFCAP_TXCSUM_IPV6) {
3872 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3873 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3874 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3875 else
3876 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3877 }
3878
3879 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3880 if (mask & IFCAP_RXCSUM)
3881 ifp->if_capenable ^= IFCAP_RXCSUM;
3882#ifdef foo
3883 /* We can't diff IPv6 packets from IPv4 packets on RX path.
*/ 3884 if (mask & IFCAP_RXCSUM_IPV6) 3885 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; 3886#endif 3887 3888 if (mask & IFCAP_LRO) 3889 ifp->if_capenable ^= IFCAP_LRO; 3890 3891 if (mask & IFCAP_TSO4) { 3892 ifp->if_capenable ^= IFCAP_TSO4; 3893 if (ifp->if_capenable & IFCAP_TSO4) 3894 ifp->if_hwassist |= CSUM_IP_TSO; 3895 else 3896 ifp->if_hwassist &= ~CSUM_IP_TSO; 3897 } 3898 if (mask & IFCAP_TSO6) { 3899 ifp->if_capenable ^= IFCAP_TSO6; 3900 if (ifp->if_capenable & IFCAP_TSO6) 3901 ifp->if_hwassist |= CSUM_IP6_TSO; 3902 else 3903 ifp->if_hwassist &= ~CSUM_IP6_TSO; 3904 } 3905 3906 HN_UNLOCK(sc); 3907 break; 3908 3909 case SIOCADDMULTI: 3910 case SIOCDELMULTI: 3911 HN_LOCK(sc); 3912 3913 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { 3914 HN_UNLOCK(sc); 3915 break; 3916 } 3917 if (ifp->if_drv_flags & IFF_DRV_RUNNING) { 3918 /* 3919 * Multicast uses mutex; use busy-wait for 3920 * the RNDIS reply. 3921 */ 3922 HN_NO_SLEEPING(sc); 3923 hn_rxfilter_config(sc); 3924 HN_SLEEPING_OK(sc); 3925 } 3926 3927 /* XXX vlan(4) style mcast addr maintenance */ 3928 if (hn_xpnt_vf_isready(sc)) { 3929 int old_if_flags; 3930 3931 old_if_flags = sc->hn_vf_ifp->if_flags; 3932 hn_xpnt_vf_saveifflags(sc); 3933 3934 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && 3935 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & 3936 IFF_ALLMULTI)) 3937 error = hn_xpnt_vf_iocsetflags(sc); 3938 } 3939 3940 HN_UNLOCK(sc); 3941 break; 3942 3943 case SIOCSIFMEDIA: 3944 case SIOCGIFMEDIA: 3945 HN_LOCK(sc); 3946 if (hn_xpnt_vf_isready(sc)) { 3947 /* 3948 * SIOCGIFMEDIA expects ifmediareq, so don't 3949 * create and pass ifr_vf to the VF here; just 3950 * replace the ifr_name. 3951 */ 3952 vf_ifp = sc->hn_vf_ifp; 3953 strlcpy(ifr->ifr_name, vf_ifp->if_xname, 3954 sizeof(ifr->ifr_name)); 3955 error = vf_ifp->if_ioctl(vf_ifp, cmd, data); 3956 /* Restore the ifr_name. */ 3957 strlcpy(ifr->ifr_name, ifp->if_xname, 3958 sizeof(ifr->ifr_name)); 3959 HN_UNLOCK(sc); 3960 break; 3961 } 3962 HN_UNLOCK(sc); 3963 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); 3964 break; 3965 3966 case SIOCGIFRSSHASH: 3967 ifrh = (struct ifrsshash *)data; 3968 HN_LOCK(sc); 3969 if (sc->hn_rx_ring_inuse == 1) { 3970 HN_UNLOCK(sc); 3971 ifrh->ifrh_func = RSS_FUNC_NONE; 3972 ifrh->ifrh_types = 0; 3973 break; 3974 } 3975 3976 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 3977 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; 3978 else 3979 ifrh->ifrh_func = RSS_FUNC_PRIVATE; 3980 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); 3981 HN_UNLOCK(sc); 3982 break; 3983 3984 case SIOCGIFRSSKEY: 3985 ifrk = (struct ifrsskey *)data; 3986 HN_LOCK(sc); 3987 if (sc->hn_rx_ring_inuse == 1) { 3988 HN_UNLOCK(sc); 3989 ifrk->ifrk_func = RSS_FUNC_NONE; 3990 ifrk->ifrk_keylen = 0; 3991 break; 3992 } 3993 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) 3994 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; 3995 else 3996 ifrk->ifrk_func = RSS_FUNC_PRIVATE; 3997 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; 3998 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, 3999 NDIS_HASH_KEYSIZE_TOEPLITZ); 4000 HN_UNLOCK(sc); 4001 break; 4002 4003 default: 4004 error = ether_ioctl(ifp, cmd, data); 4005 break; 4006 } 4007 return (error); 4008} 4009 4010static void 4011hn_stop(struct hn_softc *sc, bool detaching) 4012{ 4013 struct ifnet *ifp = sc->hn_ifp; 4014 int i; 4015 4016 HN_LOCK_ASSERT(sc); 4017 4018 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, 4019 ("synthetic parts were not attached")); 4020 4021 /* Clear RUNNING bit ASAP. 
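 * so that new transmit attempts see a stopped interface while the
 * data path is being torn down.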
*/ 4022 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4023 4024 /* Disable polling. */ 4025 hn_polling(sc, 0); 4026 4027 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 4028 KASSERT(sc->hn_vf_ifp != NULL, 4029 ("%s: VF is not attached", ifp->if_xname)); 4030 4031 /* Mark transparent mode VF as disabled. */ 4032 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); 4033 4034 /* 4035 * NOTE: 4036 * Datapath setting must happen _before_ bringing 4037 * the VF down. 4038 */ 4039 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); 4040 4041 /* 4042 * Bring the VF down. 4043 */ 4044 hn_xpnt_vf_saveifflags(sc); 4045 sc->hn_vf_ifp->if_flags &= ~IFF_UP; 4046 hn_xpnt_vf_iocsetflags(sc); 4047 } 4048 4049 /* Suspend data transfers. */ 4050 hn_suspend_data(sc); 4051 4052 /* Clear OACTIVE bit. */ 4053 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4054 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4055 sc->hn_tx_ring[i].hn_oactive = 0; 4056 4057 /* 4058 * If the non-transparent mode VF is active, make sure 4059 * that the RX filter still allows packet reception. 4060 */ 4061 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) 4062 hn_rxfilter_config(sc); 4063} 4064 4065static void 4066hn_init_locked(struct hn_softc *sc) 4067{ 4068 struct ifnet *ifp = sc->hn_ifp; 4069 int i; 4070 4071 HN_LOCK_ASSERT(sc); 4072 4073 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) 4074 return; 4075 4076 if (ifp->if_drv_flags & IFF_DRV_RUNNING) 4077 return; 4078 4079 /* Configure RX filter */ 4080 hn_rxfilter_config(sc); 4081 4082 /* Clear OACTIVE bit. */ 4083 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 4084 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 4085 sc->hn_tx_ring[i].hn_oactive = 0; 4086 4087 /* Clear TX 'suspended' bit. */ 4088 hn_resume_tx(sc, sc->hn_tx_ring_inuse); 4089 4090 if (hn_xpnt_vf_isready(sc)) { 4091 /* Initialize transparent VF. */ 4092 hn_xpnt_vf_init(sc); 4093 } 4094 4095 /* Everything is ready; unleash! */ 4096 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); 4097 4098 /* Re-enable polling if requested. */ 4099 if (sc->hn_pollhz > 0) 4100 hn_polling(sc, sc->hn_pollhz); 4101} 4102 4103static void 4104hn_init(void *xsc) 4105{ 4106 struct hn_softc *sc = xsc; 4107 4108 HN_LOCK(sc); 4109 hn_init_locked(sc); 4110 HN_UNLOCK(sc); 4111} 4112 4113#if __FreeBSD_version >= 1100099 4114 4115static int 4116hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) 4117{ 4118 struct hn_softc *sc = arg1; 4119 unsigned int lenlim; 4120 int error; 4121 4122 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; 4123 error = sysctl_handle_int(oidp, &lenlim, 0, req); 4124 if (error || req->newptr == NULL) 4125 return error; 4126 4127 HN_LOCK(sc); 4128 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || 4129 lenlim > TCP_LRO_LENGTH_MAX) { 4130 HN_UNLOCK(sc); 4131 return EINVAL; 4132 } 4133 hn_set_lro_lenlim(sc, lenlim); 4134 HN_UNLOCK(sc); 4135 4136 return 0; 4137} 4138 4139static int 4140hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) 4141{ 4142 struct hn_softc *sc = arg1; 4143 int ackcnt, error, i; 4144 4145 /* 4146 * lro_ackcnt_lim is append count limit, 4147 * +1 to turn it into aggregation limit. 4148 */ 4149 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; 4150 error = sysctl_handle_int(oidp, &ackcnt, 0, req); 4151 if (error || req->newptr == NULL) 4152 return error; 4153 4154 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) 4155 return EINVAL; 4156 4157 /* 4158 * Convert aggregation limit back to append 4159 * count limit. 
4160 */ 4161 --ackcnt; 4162 HN_LOCK(sc); 4163 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 4164 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; 4165 HN_UNLOCK(sc); 4166 return 0; 4167} 4168 4169#endif 4170 4171static int 4172hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) 4173{ 4174 struct hn_softc *sc = arg1; 4175 int hcsum = arg2; 4176 int on, error, i; 4177 4178 on = 0; 4179 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) 4180 on = 1; 4181 4182 error = sysctl_handle_int(oidp, &on, 0, req); 4183 if (error || req->newptr == NULL) 4184 return error; 4185 4186 HN_LOCK(sc); 4187 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4188 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4189 4190 if (on) 4191 rxr->hn_trust_hcsum |= hcsum; 4192 else 4193 rxr->hn_trust_hcsum &= ~hcsum; 4194 } 4195 HN_UNLOCK(sc); 4196 return 0; 4197} 4198 4199static int 4200hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) 4201{ 4202 struct hn_softc *sc = arg1; 4203 int chim_size, error; 4204 4205 chim_size = sc->hn_tx_ring[0].hn_chim_size; 4206 error = sysctl_handle_int(oidp, &chim_size, 0, req); 4207 if (error || req->newptr == NULL) 4208 return error; 4209 4210 if (chim_size > sc->hn_chim_szmax || chim_size <= 0) 4211 return EINVAL; 4212 4213 HN_LOCK(sc); 4214 hn_set_chim_size(sc, chim_size); 4215 HN_UNLOCK(sc); 4216 return 0; 4217} 4218 4219#if __FreeBSD_version < 1100095 4220static int 4221hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) 4222{ 4223 struct hn_softc *sc = arg1; 4224 int ofs = arg2, i, error; 4225 struct hn_rx_ring *rxr; 4226 uint64_t stat; 4227 4228 stat = 0; 4229 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { 4230 rxr = &sc->hn_rx_ring[i]; 4231 stat += *((int *)((uint8_t *)rxr + ofs)); 4232 } 4233 4234 error = sysctl_handle_64(oidp, &stat, 0, req); 4235 if (error || req->newptr == NULL) 4236 return error; 4237 4238 /* Zero out this stat. */ 4239 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { 4240 rxr = &sc->hn_rx_ring[i]; 4241 *((int *)((uint8_t *)rxr + ofs)) = 0; 4242 } 4243 return 0; 4244} 4245#else 4246static int 4247hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) 4248{ 4249 struct hn_softc *sc = arg1; 4250 int ofs = arg2, i, error; 4251 struct hn_rx_ring *rxr; 4252 uint64_t stat; 4253 4254 stat = 0; 4255 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4256 rxr = &sc->hn_rx_ring[i]; 4257 stat += *((uint64_t *)((uint8_t *)rxr + ofs)); 4258 } 4259 4260 error = sysctl_handle_64(oidp, &stat, 0, req); 4261 if (error || req->newptr == NULL) 4262 return error; 4263 4264 /* Zero out this stat. */ 4265 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4266 rxr = &sc->hn_rx_ring[i]; 4267 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; 4268 } 4269 return 0; 4270} 4271 4272#endif 4273 4274static int 4275hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4276{ 4277 struct hn_softc *sc = arg1; 4278 int ofs = arg2, i, error; 4279 struct hn_rx_ring *rxr; 4280 u_long stat; 4281 4282 stat = 0; 4283 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4284 rxr = &sc->hn_rx_ring[i]; 4285 stat += *((u_long *)((uint8_t *)rxr + ofs)); 4286 } 4287 4288 error = sysctl_handle_long(oidp, &stat, 0, req); 4289 if (error || req->newptr == NULL) 4290 return error; 4291 4292 /* Zero out this stat. 
*/ 4293 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4294 rxr = &sc->hn_rx_ring[i]; 4295 *((u_long *)((uint8_t *)rxr + ofs)) = 0; 4296 } 4297 return 0; 4298} 4299 4300static int 4301hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) 4302{ 4303 struct hn_softc *sc = arg1; 4304 int ofs = arg2, i, error; 4305 struct hn_tx_ring *txr; 4306 u_long stat; 4307 4308 stat = 0; 4309 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4310 txr = &sc->hn_tx_ring[i]; 4311 stat += *((u_long *)((uint8_t *)txr + ofs)); 4312 } 4313 4314 error = sysctl_handle_long(oidp, &stat, 0, req); 4315 if (error || req->newptr == NULL) 4316 return error; 4317 4318 /* Zero out this stat. */ 4319 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4320 txr = &sc->hn_tx_ring[i]; 4321 *((u_long *)((uint8_t *)txr + ofs)) = 0; 4322 } 4323 return 0; 4324} 4325 4326static int 4327hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) 4328{ 4329 struct hn_softc *sc = arg1; 4330 int ofs = arg2, i, error, conf; 4331 struct hn_tx_ring *txr; 4332 4333 txr = &sc->hn_tx_ring[0]; 4334 conf = *((int *)((uint8_t *)txr + ofs)); 4335 4336 error = sysctl_handle_int(oidp, &conf, 0, req); 4337 if (error || req->newptr == NULL) 4338 return error; 4339 4340 HN_LOCK(sc); 4341 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 4342 txr = &sc->hn_tx_ring[i]; 4343 *((int *)((uint8_t *)txr + ofs)) = conf; 4344 } 4345 HN_UNLOCK(sc); 4346 4347 return 0; 4348} 4349 4350static int 4351hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) 4352{ 4353 struct hn_softc *sc = arg1; 4354 int error, size; 4355 4356 size = sc->hn_agg_size; 4357 error = sysctl_handle_int(oidp, &size, 0, req); 4358 if (error || req->newptr == NULL) 4359 return (error); 4360 4361 HN_LOCK(sc); 4362 sc->hn_agg_size = size; 4363 hn_set_txagg(sc); 4364 HN_UNLOCK(sc); 4365 4366 return (0); 4367} 4368 4369static int 4370hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) 4371{ 4372 struct hn_softc *sc = arg1; 4373 int error, pkts; 4374 4375 pkts = sc->hn_agg_pkts; 4376 error = sysctl_handle_int(oidp, &pkts, 0, req); 4377 if (error || req->newptr == NULL) 4378 return (error); 4379 4380 HN_LOCK(sc); 4381 sc->hn_agg_pkts = pkts; 4382 hn_set_txagg(sc); 4383 HN_UNLOCK(sc); 4384 4385 return (0); 4386} 4387 4388static int 4389hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) 4390{ 4391 struct hn_softc *sc = arg1; 4392 int pkts; 4393 4394 pkts = sc->hn_tx_ring[0].hn_agg_pktmax; 4395 return (sysctl_handle_int(oidp, &pkts, 0, req)); 4396} 4397 4398static int 4399hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) 4400{ 4401 struct hn_softc *sc = arg1; 4402 int align; 4403 4404 align = sc->hn_tx_ring[0].hn_agg_align; 4405 return (sysctl_handle_int(oidp, &align, 0, req)); 4406} 4407 4408static void 4409hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) 4410{ 4411 if (pollhz == 0) 4412 vmbus_chan_poll_disable(chan); 4413 else 4414 vmbus_chan_poll_enable(chan, pollhz); 4415} 4416 4417static void 4418hn_polling(struct hn_softc *sc, u_int pollhz) 4419{ 4420 int nsubch = sc->hn_rx_ring_inuse - 1; 4421 4422 HN_LOCK_ASSERT(sc); 4423 4424 if (nsubch > 0) { 4425 struct vmbus_channel **subch; 4426 int i; 4427 4428 subch = vmbus_subchan_get(sc->hn_prichan, nsubch); 4429 for (i = 0; i < nsubch; ++i) 4430 hn_chan_polling(subch[i], pollhz); 4431 vmbus_subchan_rel(subch, nsubch); 4432 } 4433 hn_chan_polling(sc->hn_prichan, pollhz); 4434} 4435 4436static int 4437hn_polling_sysctl(SYSCTL_HANDLER_ARGS) 4438{ 4439 struct hn_softc *sc = arg1; 4440 int pollhz, error; 4441 4442 pollhz = sc->hn_pollhz; 4443 error = sysctl_handle_int(oidp, &pollhz, 0, req); 4444 if (error || req->newptr == NULL) 
4445 return (error); 4446 4447 if (pollhz != 0 && 4448 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) 4449 return (EINVAL); 4450 4451 HN_LOCK(sc); 4452 if (sc->hn_pollhz != pollhz) { 4453 sc->hn_pollhz = pollhz; 4454 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && 4455 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) 4456 hn_polling(sc, sc->hn_pollhz); 4457 } 4458 HN_UNLOCK(sc); 4459 4460 return (0); 4461} 4462 4463static int 4464hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) 4465{ 4466 struct hn_softc *sc = arg1; 4467 char verstr[16]; 4468 4469 snprintf(verstr, sizeof(verstr), "%u.%u", 4470 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), 4471 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); 4472 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); 4473} 4474 4475static int 4476hn_caps_sysctl(SYSCTL_HANDLER_ARGS) 4477{ 4478 struct hn_softc *sc = arg1; 4479 char caps_str[128]; 4480 uint32_t caps; 4481 4482 HN_LOCK(sc); 4483 caps = sc->hn_caps; 4484 HN_UNLOCK(sc); 4485 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); 4486 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); 4487} 4488 4489static int 4490hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) 4491{ 4492 struct hn_softc *sc = arg1; 4493 char assist_str[128]; 4494 uint32_t hwassist; 4495 4496 HN_LOCK(sc); 4497 hwassist = sc->hn_ifp->if_hwassist; 4498 HN_UNLOCK(sc); 4499 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); 4500 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); 4501} 4502 4503static int 4504hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) 4505{ 4506 struct hn_softc *sc = arg1; 4507 char filter_str[128]; 4508 uint32_t filter; 4509 4510 HN_LOCK(sc); 4511 filter = sc->hn_rx_filter; 4512 HN_UNLOCK(sc); 4513 snprintf(filter_str, sizeof(filter_str), "%b", filter, 4514 NDIS_PACKET_TYPES); 4515 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); 4516} 4517 4518static int 4519hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) 4520{ 4521 struct hn_softc *sc = arg1; 4522 int error; 4523 4524 HN_LOCK(sc); 4525 4526 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4527 if (error || req->newptr == NULL) 4528 goto back; 4529 4530 if ((sc->hn_flags & HN_FLAG_RXVF) || 4531 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { 4532 /* 4533 * RSS key is synchronized w/ VF's, don't allow users 4534 * to change it. 4535 */ 4536 error = EBUSY; 4537 goto back; 4538 } 4539 4540 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); 4541 if (error) 4542 goto back; 4543 sc->hn_flags |= HN_FLAG_HAS_RSSKEY; 4544 4545 if (sc->hn_rx_ring_inuse > 1) { 4546 error = hn_rss_reconfig(sc); 4547 } else { 4548 /* Not RSS capable, at least for now; just save the RSS key. */ 4549 error = 0; 4550 } 4551back: 4552 HN_UNLOCK(sc); 4553 return (error); 4554} 4555 4556static int 4557hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) 4558{ 4559 struct hn_softc *sc = arg1; 4560 int error; 4561 4562 HN_LOCK(sc); 4563 4564 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4565 if (error || req->newptr == NULL) 4566 goto back; 4567 4568 /* 4569 * Don't allow RSS indirect table change, if this interface is not 4570 * RSS capable currently. 
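 * The indirect table maps hash values to RX ring indices, so with a single RX ring in use there is nothing to remap and the write is rejected below.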
4571 */ 4572 if (sc->hn_rx_ring_inuse == 1) { 4573 error = EOPNOTSUPP; 4574 goto back; 4575 } 4576 4577 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); 4578 if (error) 4579 goto back; 4580 sc->hn_flags |= HN_FLAG_HAS_RSSIND; 4581 4582 hn_rss_ind_fixup(sc); 4583 error = hn_rss_reconfig(sc); 4584back: 4585 HN_UNLOCK(sc); 4586 return (error); 4587} 4588 4589static int 4590hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) 4591{ 4592 struct hn_softc *sc = arg1; 4593 char hash_str[128]; 4594 uint32_t hash; 4595 4596 HN_LOCK(sc); 4597 hash = sc->hn_rss_hash; 4598 HN_UNLOCK(sc); 4599 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4600 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4601} 4602 4603static int 4604hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) 4605{ 4606 struct hn_softc *sc = arg1; 4607 char hash_str[128]; 4608 uint32_t hash; 4609 4610 HN_LOCK(sc); 4611 hash = sc->hn_rss_hcap; 4612 HN_UNLOCK(sc); 4613 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4614 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4615} 4616 4617static int 4618hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) 4619{ 4620 struct hn_softc *sc = arg1; 4621 char hash_str[128]; 4622 uint32_t hash; 4623 4624 HN_LOCK(sc); 4625 hash = sc->hn_rx_ring[0].hn_mbuf_hash; 4626 HN_UNLOCK(sc); 4627 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); 4628 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); 4629} 4630 4631static int 4632hn_vf_sysctl(SYSCTL_HANDLER_ARGS) 4633{ 4634 struct hn_softc *sc = arg1; 4635 char vf_name[IFNAMSIZ + 1]; 4636 struct ifnet *vf_ifp; 4637 4638 HN_LOCK(sc); 4639 vf_name[0] = '\0'; 4640 vf_ifp = sc->hn_vf_ifp; 4641 if (vf_ifp != NULL) 4642 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4643 HN_UNLOCK(sc); 4644 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4645} 4646 4647static int 4648hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) 4649{ 4650 struct hn_softc *sc = arg1; 4651 char vf_name[IFNAMSIZ + 1]; 4652 struct ifnet *vf_ifp; 4653 4654 HN_LOCK(sc); 4655 vf_name[0] = '\0'; 4656 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; 4657 if (vf_ifp != NULL) 4658 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); 4659 HN_UNLOCK(sc); 4660 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); 4661} 4662 4663static int 4664hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) 4665{ 4666 struct rm_priotracker pt; 4667 struct sbuf *sb; 4668 int error, i; 4669 bool first; 4670 4671 error = sysctl_wire_old_buffer(req, 0); 4672 if (error != 0) 4673 return (error); 4674 4675 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4676 if (sb == NULL) 4677 return (ENOMEM); 4678 4679 rm_rlock(&hn_vfmap_lock, &pt); 4680 4681 first = true; 4682 for (i = 0; i < hn_vfmap_size; ++i) { 4683 struct ifnet *ifp; 4684 4685 if (hn_vfmap[i] == NULL) 4686 continue; 4687 4688 ifp = ifnet_byindex(i); 4689 if (ifp != NULL) { 4690 if (first) 4691 sbuf_printf(sb, "%s", ifp->if_xname); 4692 else 4693 sbuf_printf(sb, " %s", ifp->if_xname); 4694 first = false; 4695 } 4696 } 4697 4698 rm_runlock(&hn_vfmap_lock, &pt); 4699 4700 error = sbuf_finish(sb); 4701 sbuf_delete(sb); 4702 return (error); 4703} 4704 4705static int 4706hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) 4707{ 4708 struct rm_priotracker pt; 4709 struct sbuf *sb; 4710 int error, i; 4711 bool first; 4712 4713 error = sysctl_wire_old_buffer(req, 0); 4714 if (error != 0) 4715 return (error); 4716 4717 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); 4718 if (sb == NULL) 4719 
return (ENOMEM); 4720 4721 rm_rlock(&hn_vfmap_lock, &pt); 4722 4723 first = true; 4724 for (i = 0; i < hn_vfmap_size; ++i) { 4725 struct ifnet *ifp, *hn_ifp; 4726 4727 hn_ifp = hn_vfmap[i]; 4728 if (hn_ifp == NULL) 4729 continue; 4730 4731 ifp = ifnet_byindex(i); 4732 if (ifp != NULL) { 4733 if (first) { 4734 sbuf_printf(sb, "%s:%s", ifp->if_xname, 4735 hn_ifp->if_xname); 4736 } else { 4737 sbuf_printf(sb, " %s:%s", ifp->if_xname, 4738 hn_ifp->if_xname); 4739 } 4740 first = false; 4741 } 4742 } 4743 4744 rm_runlock(&hn_vfmap_lock, &pt); 4745 4746 error = sbuf_finish(sb); 4747 sbuf_delete(sb); 4748 return (error); 4749} 4750 4751static int 4752hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) 4753{ 4754 struct hn_softc *sc = arg1; 4755 int error, onoff = 0; 4756 4757 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) 4758 onoff = 1; 4759 error = sysctl_handle_int(oidp, &onoff, 0, req); 4760 if (error || req->newptr == NULL) 4761 return (error); 4762 4763 HN_LOCK(sc); 4764 /* NOTE: hn_vf_lock for hn_transmit() */ 4765 rm_wlock(&sc->hn_vf_lock); 4766 if (onoff) 4767 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; 4768 else 4769 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; 4770 rm_wunlock(&sc->hn_vf_lock); 4771 HN_UNLOCK(sc); 4772 4773 return (0); 4774} 4775 4776static int 4777hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) 4778{ 4779 struct hn_softc *sc = arg1; 4780 int enabled = 0; 4781 4782 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 4783 enabled = 1; 4784 return (sysctl_handle_int(oidp, &enabled, 0, req)); 4785} 4786 4787static int 4788hn_check_iplen(const struct mbuf *m, int hoff) 4789{ 4790 const struct ip *ip; 4791 int len, iphlen, iplen; 4792 const struct tcphdr *th; 4793 int thoff; /* TCP data offset */ 4794 4795 len = hoff + sizeof(struct ip); 4796 4797 /* The packet must be at least the size of an IP header. */ 4798 if (m->m_pkthdr.len < len) 4799 return IPPROTO_DONE; 4800 4801 /* The fixed IP header must reside completely in the first mbuf. */ 4802 if (m->m_len < len) 4803 return IPPROTO_DONE; 4804 4805 ip = mtodo(m, hoff); 4806 4807 /* Bound check the packet's stated IP header length. */ 4808 iphlen = ip->ip_hl << 2; 4809 if (iphlen < sizeof(struct ip)) /* minimum header length */ 4810 return IPPROTO_DONE; 4811 4812 /* The full IP header must reside completely in the first mbuf. */ 4813 if (m->m_len < hoff + iphlen) 4814 return IPPROTO_DONE; 4815 4816 iplen = ntohs(ip->ip_len); 4817 4818 /* 4819 * Check that the amount of data in the buffers is at 4820 * least as much as the IP header would have us expect. 4821 */ 4822 if (m->m_pkthdr.len < hoff + iplen) 4823 return IPPROTO_DONE; 4824 4825 /* 4826 * Ignore IP fragments. 4827 */ 4828 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) 4829 return IPPROTO_DONE; 4830 4831 /* 4832 * The TCP/IP or UDP/IP header must be entirely contained within 4833 * the first fragment of a packet.
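 * For TCP this means the first mbuf must hold hoff + iphlen + (th_off << 2) bytes, e.g. 14 + 20 + 20 = 54 bytes for an option-less TCP/IPv4 packet on Ethernet, since the headers are dereferenced in place below.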
4834 */ 4835 switch (ip->ip_p) { 4836 case IPPROTO_TCP: 4837 if (iplen < iphlen + sizeof(struct tcphdr)) 4838 return IPPROTO_DONE; 4839 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) 4840 return IPPROTO_DONE; 4841 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); 4842 thoff = th->th_off << 2; 4843 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) 4844 return IPPROTO_DONE; 4845 if (m->m_len < hoff + iphlen + thoff) 4846 return IPPROTO_DONE; 4847 break; 4848 case IPPROTO_UDP: 4849 if (iplen < iphlen + sizeof(struct udphdr)) 4850 return IPPROTO_DONE; 4851 if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) 4852 return IPPROTO_DONE; 4853 break; 4854 default: 4855 if (iplen < iphlen) 4856 return IPPROTO_DONE; 4857 break; 4858 } 4859 return ip->ip_p; 4860} 4861 4862static void 4863hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) 4864{ 4865 const struct ether_header *eh; 4866 uint16_t etype; 4867 int hoff; 4868 4869 hoff = sizeof(*eh); 4870 /* Checked at the beginning of this function. */ 4871 KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); 4872 4873 eh = mtod(m_new, const struct ether_header *); 4874 etype = ntohs(eh->ether_type); 4875 if (etype == ETHERTYPE_VLAN) { 4876 const struct ether_vlan_header *evl; 4877 4878 hoff = sizeof(*evl); 4879 if (m_new->m_len < hoff) 4880 return; 4881 evl = mtod(m_new, const struct ether_vlan_header *); 4882 etype = ntohs(evl->evl_proto); 4883 } 4884 *l3proto = etype; 4885 4886 if (etype == ETHERTYPE_IP) 4887 *l4proto = hn_check_iplen(m_new, hoff); 4888 else 4889 *l4proto = IPPROTO_DONE; 4890} 4891 4892static int 4893hn_create_rx_data(struct hn_softc *sc, int ring_cnt) 4894{ 4895 struct sysctl_oid_list *child; 4896 struct sysctl_ctx_list *ctx; 4897 device_t dev = sc->hn_dev; 4898#if defined(INET) || defined(INET6) 4899#if __FreeBSD_version >= 1100095 4900 int lroent_cnt; 4901#endif 4902#endif 4903 int i; 4904 4905 /* 4906 * Create RXBUF for reception. 4907 * 4908 * NOTE: 4909 * - It is shared by all channels. 4910 * - A large enough buffer is allocated, certain version of NVSes 4911 * may further limit the usable space. 
4912 */ 4913 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4914 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, 4915 BUS_DMA_WAITOK | BUS_DMA_ZERO); 4916 if (sc->hn_rxbuf == NULL) { 4917 device_printf(sc->hn_dev, "allocate rxbuf failed\n"); 4918 return (ENOMEM); 4919 } 4920 4921 sc->hn_rx_ring_cnt = ring_cnt; 4922 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; 4923 4924 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, 4925 M_DEVBUF, M_WAITOK | M_ZERO); 4926 4927#if defined(INET) || defined(INET6) 4928#if __FreeBSD_version >= 1100095 4929 lroent_cnt = hn_lro_entry_count; 4930 if (lroent_cnt < TCP_LRO_ENTRIES) 4931 lroent_cnt = TCP_LRO_ENTRIES; 4932 if (bootverbose) 4933 device_printf(dev, "LRO: entry count %d\n", lroent_cnt); 4934#endif 4935#endif /* INET || INET6 */ 4936 4937 ctx = device_get_sysctl_ctx(dev); 4938 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); 4939 4940 /* Create dev.hn.UNIT.rx sysctl tree */ 4941 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", 4942 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4943 4944 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 4945 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 4946 4947 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), 4948 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, 4949 &rxr->hn_br_dma, BUS_DMA_WAITOK); 4950 if (rxr->hn_br == NULL) { 4951 device_printf(dev, "allocate bufring failed\n"); 4952 return (ENOMEM); 4953 } 4954 4955 if (hn_trust_hosttcp) 4956 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; 4957 if (hn_trust_hostudp) 4958 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; 4959 if (hn_trust_hostip) 4960 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; 4961 rxr->hn_mbuf_hash = NDIS_HASH_ALL; 4962 rxr->hn_ifp = sc->hn_ifp; 4963 if (i < sc->hn_tx_ring_cnt) 4964 rxr->hn_txr = &sc->hn_tx_ring[i]; 4965 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; 4966 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); 4967 rxr->hn_rx_idx = i; 4968 rxr->hn_rxbuf = sc->hn_rxbuf; 4969 4970 /* 4971 * Initialize LRO. 
*/ 4973#if defined(INET) || defined(INET6) 4974#if __FreeBSD_version >= 1100095 4975 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, 4976 hn_lro_mbufq_depth); 4977#else 4978 tcp_lro_init(&rxr->hn_lro); 4979 rxr->hn_lro.ifp = sc->hn_ifp; 4980#endif 4981#if __FreeBSD_version >= 1100099 4982 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; 4983 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; 4984#endif 4985#endif /* INET || INET6 */ 4986 4987 if (sc->hn_rx_sysctl_tree != NULL) { 4988 char name[16]; 4989 4990 /* 4991 * Create per RX ring sysctl tree: 4992 * dev.hn.UNIT.rx.RINGID 4993 */ 4994 snprintf(name, sizeof(name), "%d", i); 4995 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, 4996 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), 4997 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 4998 4999 if (rxr->hn_rx_sysctl_tree != NULL) { 5000 SYSCTL_ADD_ULONG(ctx, 5001 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5002 OID_AUTO, "packets", CTLFLAG_RW, 5003 &rxr->hn_pkts, "# of packets received"); 5004 SYSCTL_ADD_ULONG(ctx, 5005 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5006 OID_AUTO, "rss_pkts", CTLFLAG_RW, 5007 &rxr->hn_rss_pkts, 5008 "# of packets w/ RSS info received"); 5009 SYSCTL_ADD_INT(ctx, 5010 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), 5011 OID_AUTO, "pktbuf_len", CTLFLAG_RD, 5012 &rxr->hn_pktbuf_len, 0, 5013 "Temporary channel packet buffer length"); 5014 } 5015 } 5016 } 5017 5018 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", 5019 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5020 __offsetof(struct hn_rx_ring, hn_lro.lro_queued), 5021#if __FreeBSD_version < 1100095 5022 hn_rx_stat_int_sysctl, 5023#else 5024 hn_rx_stat_u64_sysctl, 5025#endif 5026 "LU", "LRO queued"); 5027 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", 5028 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5029 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), 5030#if __FreeBSD_version < 1100095 5031 hn_rx_stat_int_sysctl, 5032#else 5033 hn_rx_stat_u64_sysctl, 5034#endif 5035 "LU", "LRO flushed"); 5036 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", 5037 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5038 __offsetof(struct hn_rx_ring, hn_lro_tried), 5039 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); 5040#if __FreeBSD_version >= 1100099 5041 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", 5042 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5043 hn_lro_lenlim_sysctl, "IU", 5044 "Max # of data bytes to be aggregated by LRO"); 5045 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", 5046 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5047 hn_lro_ackcnt_sysctl, "I", 5048 "Max # of ACKs to be aggregated by LRO"); 5049#endif 5050 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", 5051 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, 5052 hn_trust_hcsum_sysctl, "I", 5053 "Trust tcp segment verification on host side, " 5054 "when csum info is missing"); 5055 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", 5056 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, 5057 hn_trust_hcsum_sysctl, "I", 5058 "Trust udp datagram verification on host side, " 5059 "when csum info is missing"); 5060 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", 5061 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, 5062 hn_trust_hcsum_sysctl, "I", 5063 "Trust ip packet verification on host side, " 5064 "when csum info is missing"); 5065 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", 5066 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5067
__offsetof(struct hn_rx_ring, hn_csum_ip), 5068 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); 5069 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", 5070 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5071 __offsetof(struct hn_rx_ring, hn_csum_tcp), 5072 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); 5073 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", 5074 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5075 __offsetof(struct hn_rx_ring, hn_csum_udp), 5076 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); 5077 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", 5078 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5079 __offsetof(struct hn_rx_ring, hn_csum_trusted), 5080 hn_rx_stat_ulong_sysctl, "LU", 5081 "# of packets that we trust host's csum verification"); 5082 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", 5083 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5084 __offsetof(struct hn_rx_ring, hn_small_pkts), 5085 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); 5086 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", 5087 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5088 __offsetof(struct hn_rx_ring, hn_ack_failed), 5089 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); 5090 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", 5091 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); 5092 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", 5093 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); 5094 5095 return (0); 5096} 5097 5098static void 5099hn_destroy_rx_data(struct hn_softc *sc) 5100{ 5101 int i; 5102 5103 if (sc->hn_rxbuf != NULL) { 5104 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) 5105 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); 5106 else 5107 device_printf(sc->hn_dev, "RXBUF is referenced\n"); 5108 sc->hn_rxbuf = NULL; 5109 } 5110 5111 if (sc->hn_rx_ring_cnt == 0) 5112 return; 5113 5114 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { 5115 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; 5116 5117 if (rxr->hn_br == NULL) 5118 continue; 5119 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { 5120 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); 5121 } else { 5122 device_printf(sc->hn_dev, 5123 "%dth channel bufring is referenced", i); 5124 } 5125 rxr->hn_br = NULL; 5126 5127#if defined(INET) || defined(INET6) 5128 tcp_lro_free(&rxr->hn_lro); 5129#endif 5130 free(rxr->hn_pktbuf, M_DEVBUF); 5131 } 5132 free(sc->hn_rx_ring, M_DEVBUF); 5133 sc->hn_rx_ring = NULL; 5134 5135 sc->hn_rx_ring_cnt = 0; 5136 sc->hn_rx_ring_inuse = 0; 5137} 5138 5139static int 5140hn_tx_ring_create(struct hn_softc *sc, int id) 5141{ 5142 struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; 5143 device_t dev = sc->hn_dev; 5144 bus_dma_tag_t parent_dtag; 5145 int error, i; 5146 5147 txr->hn_sc = sc; 5148 txr->hn_tx_idx = id; 5149 5150#ifndef HN_USE_TXDESC_BUFRING 5151 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); 5152#endif 5153 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); 5154 5155 txr->hn_txdesc_cnt = HN_TX_DESC_CNT; 5156 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, 5157 M_DEVBUF, M_WAITOK | M_ZERO); 5158#ifndef HN_USE_TXDESC_BUFRING 5159 SLIST_INIT(&txr->hn_txlist); 5160#else 5161 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, 5162 M_WAITOK, &txr->hn_tx_lock); 5163#endif 5164 5165 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { 5166 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( 5167 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); 5168 } else { 5169 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % 
hn_tx_taskq_cnt]; 5170 } 5171 5172#ifdef HN_IFSTART_SUPPORT 5173 if (hn_use_if_start) { 5174 txr->hn_txeof = hn_start_txeof; 5175 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); 5176 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); 5177 } else 5178#endif 5179 { 5180 int br_depth; 5181 5182 txr->hn_txeof = hn_xmit_txeof; 5183 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); 5184 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); 5185 5186 br_depth = hn_get_txswq_depth(txr); 5187 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, 5188 M_WAITOK, &txr->hn_tx_lock); 5189 } 5190 5191 txr->hn_direct_tx_size = hn_direct_tx_size; 5192 5193 /* 5194 * Always schedule transmission instead of trying to do direct 5195 * transmission. This one gives the best performance so far. 5196 */ 5197 txr->hn_sched_tx = 1; 5198 5199 parent_dtag = bus_get_dma_tag(dev); 5200 5201 /* DMA tag for RNDIS packet messages. */ 5202 error = bus_dma_tag_create(parent_dtag, /* parent */ 5203 HN_RNDIS_PKT_ALIGN, /* alignment */ 5204 HN_RNDIS_PKT_BOUNDARY, /* boundary */ 5205 BUS_SPACE_MAXADDR, /* lowaddr */ 5206 BUS_SPACE_MAXADDR, /* highaddr */ 5207 NULL, NULL, /* filter, filterarg */ 5208 HN_RNDIS_PKT_LEN, /* maxsize */ 5209 1, /* nsegments */ 5210 HN_RNDIS_PKT_LEN, /* maxsegsize */ 5211 0, /* flags */ 5212 NULL, /* lockfunc */ 5213 NULL, /* lockfuncarg */ 5214 &txr->hn_tx_rndis_dtag); 5215 if (error) { 5216 device_printf(dev, "failed to create rndis dmatag\n"); 5217 return error; 5218 } 5219 5220 /* DMA tag for data. */ 5221 error = bus_dma_tag_create(parent_dtag, /* parent */ 5222 1, /* alignment */ 5223 HN_TX_DATA_BOUNDARY, /* boundary */ 5224 BUS_SPACE_MAXADDR, /* lowaddr */ 5225 BUS_SPACE_MAXADDR, /* highaddr */ 5226 NULL, NULL, /* filter, filterarg */ 5227 HN_TX_DATA_MAXSIZE, /* maxsize */ 5228 HN_TX_DATA_SEGCNT_MAX, /* nsegments */ 5229 HN_TX_DATA_SEGSIZE, /* maxsegsize */ 5230 0, /* flags */ 5231 NULL, /* lockfunc */ 5232 NULL, /* lockfuncarg */ 5233 &txr->hn_tx_data_dtag); 5234 if (error) { 5235 device_printf(dev, "failed to create data dmatag\n"); 5236 return error; 5237 } 5238 5239 for (i = 0; i < txr->hn_txdesc_cnt; ++i) { 5240 struct hn_txdesc *txd = &txr->hn_txdesc[i]; 5241 5242 txd->txr = txr; 5243 txd->chim_index = HN_NVS_CHIM_IDX_INVALID; 5244 STAILQ_INIT(&txd->agg_list); 5245 5246 /* 5247 * Allocate and load RNDIS packet message. 5248 */ 5249 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, 5250 (void **)&txd->rndis_pkt, 5251 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, 5252 &txd->rndis_pkt_dmap); 5253 if (error) { 5254 device_printf(dev, 5255 "failed to allocate rndis_packet_msg, %d\n", i); 5256 return error; 5257 } 5258 5259 error = bus_dmamap_load(txr->hn_tx_rndis_dtag, 5260 txd->rndis_pkt_dmap, 5261 txd->rndis_pkt, HN_RNDIS_PKT_LEN, 5262 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, 5263 BUS_DMA_NOWAIT); 5264 if (error) { 5265 device_printf(dev, 5266 "failed to load rndis_packet_msg, %d\n", i); 5267 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5268 txd->rndis_pkt, txd->rndis_pkt_dmap); 5269 return error; 5270 } 5271 5272 /* DMA map for TX data. 
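 One map per txdesc, created empty here; it is loaded with the mbuf chain at transmit time when a packet is sent by DMA rather than through the chimney buffer.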
*/ 5273 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, 5274 &txd->data_dmap); 5275 if (error) { 5276 device_printf(dev, 5277 "failed to allocate tx data dmamap\n"); 5278 bus_dmamap_unload(txr->hn_tx_rndis_dtag, 5279 txd->rndis_pkt_dmap); 5280 bus_dmamem_free(txr->hn_tx_rndis_dtag, 5281 txd->rndis_pkt, txd->rndis_pkt_dmap); 5282 return error; 5283 } 5284 5285 /* All set, put it to list */ 5286 txd->flags |= HN_TXD_FLAG_ONLIST; 5287#ifndef HN_USE_TXDESC_BUFRING 5288 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); 5289#else 5290 buf_ring_enqueue(txr->hn_txdesc_br, txd); 5291#endif 5292 } 5293 txr->hn_txdesc_avail = txr->hn_txdesc_cnt; 5294 5295 if (sc->hn_tx_sysctl_tree != NULL) { 5296 struct sysctl_oid_list *child; 5297 struct sysctl_ctx_list *ctx; 5298 char name[16]; 5299 5300 /* 5301 * Create per TX ring sysctl tree: 5302 * dev.hn.UNIT.tx.RINGID 5303 */ 5304 ctx = device_get_sysctl_ctx(dev); 5305 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); 5306 5307 snprintf(name, sizeof(name), "%d", id); 5308 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, 5309 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5310 5311 if (txr->hn_tx_sysctl_tree != NULL) { 5312 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); 5313 5314#ifdef HN_DEBUG 5315 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", 5316 CTLFLAG_RD, &txr->hn_txdesc_avail, 0, 5317 "# of available TX descs"); 5318#endif 5319#ifdef HN_IFSTART_SUPPORT 5320 if (!hn_use_if_start) 5321#endif 5322 { 5323 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", 5324 CTLFLAG_RD, &txr->hn_oactive, 0, 5325 "over active"); 5326 } 5327 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", 5328 CTLFLAG_RW, &txr->hn_pkts, 5329 "# of packets transmitted"); 5330 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", 5331 CTLFLAG_RW, &txr->hn_sends, "# of sends"); 5332 } 5333 } 5334 5335 return 0; 5336} 5337 5338static void 5339hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) 5340{ 5341 struct hn_tx_ring *txr = txd->txr; 5342 5343 KASSERT(txd->m == NULL, ("still has mbuf installed")); 5344 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); 5345 5346 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); 5347 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, 5348 txd->rndis_pkt_dmap); 5349 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); 5350} 5351 5352static void 5353hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) 5354{ 5355 5356 KASSERT(txd->refs == 0 || txd->refs == 1, 5357 ("invalid txd refs %d", txd->refs)); 5358 5359 /* Aggregated txds will be freed by their aggregating txd. */ 5360 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { 5361 int freed; 5362 5363 freed = hn_txdesc_put(txr, txd); 5364 KASSERT(freed, ("can't free txdesc")); 5365 } 5366} 5367 5368static void 5369hn_tx_ring_destroy(struct hn_tx_ring *txr) 5370{ 5371 int i; 5372 5373 if (txr->hn_txdesc == NULL) 5374 return; 5375 5376 /* 5377 * NOTE: 5378 * Because the freeing of aggregated txds will be deferred 5379 * to the aggregating txd, two passes are used here: 5380 * - The first pass GCes any pending txds. This GC is necessary, 5381 * since if the channels are revoked, hypervisor will not 5382 * deliver send-done for all pending txds. 5383 * - The second pass frees the busdma stuffs, i.e. after all txds 5384 * were freed. 
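 * Destroying a DMA map that may still be loaded would be an error, so the two loops below must stay in this order.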
5385 */ 5386 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5387 hn_txdesc_gc(txr, &txr->hn_txdesc[i]); 5388 for (i = 0; i < txr->hn_txdesc_cnt; ++i) 5389 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); 5390 5391 if (txr->hn_tx_data_dtag != NULL) 5392 bus_dma_tag_destroy(txr->hn_tx_data_dtag); 5393 if (txr->hn_tx_rndis_dtag != NULL) 5394 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); 5395 5396#ifdef HN_USE_TXDESC_BUFRING 5397 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); 5398#endif 5399 5400 free(txr->hn_txdesc, M_DEVBUF); 5401 txr->hn_txdesc = NULL; 5402 5403 if (txr->hn_mbuf_br != NULL) 5404 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); 5405 5406#ifndef HN_USE_TXDESC_BUFRING 5407 mtx_destroy(&txr->hn_txlist_spin); 5408#endif 5409 mtx_destroy(&txr->hn_tx_lock); 5410} 5411 5412static int 5413hn_create_tx_data(struct hn_softc *sc, int ring_cnt) 5414{ 5415 struct sysctl_oid_list *child; 5416 struct sysctl_ctx_list *ctx; 5417 int i; 5418 5419 /* 5420 * Create TXBUF for chimney sending. 5421 * 5422 * NOTE: It is shared by all channels. 5423 */ 5424 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), 5425 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, 5426 BUS_DMA_WAITOK | BUS_DMA_ZERO); 5427 if (sc->hn_chim == NULL) { 5428 device_printf(sc->hn_dev, "allocate txbuf failed\n"); 5429 return (ENOMEM); 5430 } 5431 5432 sc->hn_tx_ring_cnt = ring_cnt; 5433 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; 5434 5435 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, 5436 M_DEVBUF, M_WAITOK | M_ZERO); 5437 5438 ctx = device_get_sysctl_ctx(sc->hn_dev); 5439 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); 5440 5441 /* Create dev.hn.UNIT.tx sysctl tree */ 5442 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", 5443 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); 5444 5445 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { 5446 int error; 5447 5448 error = hn_tx_ring_create(sc, i); 5449 if (error) 5450 return error; 5451 } 5452 5453 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", 5454 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5455 __offsetof(struct hn_tx_ring, hn_no_txdescs), 5456 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); 5457 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", 5458 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5459 __offsetof(struct hn_tx_ring, hn_send_failed), 5460 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); 5461 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", 5462 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5463 __offsetof(struct hn_tx_ring, hn_txdma_failed), 5464 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); 5465 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", 5466 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5467 __offsetof(struct hn_tx_ring, hn_flush_failed), 5468 hn_tx_stat_ulong_sysctl, "LU", 5469 "# of packet transmission aggregation flush failure"); 5470 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", 5471 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5472 __offsetof(struct hn_tx_ring, hn_tx_collapsed), 5473 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); 5474 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", 5475 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5476 __offsetof(struct hn_tx_ring, hn_tx_chimney), 5477 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); 5478 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", 5479 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5480 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), 5481 
hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); 5482 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", 5483 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, 5484 "# of total TX descs"); 5485 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", 5486 CTLFLAG_RD, &sc->hn_chim_szmax, 0, 5487 "Chimney send packet size upper boundary"); 5488 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", 5489 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, 5490 hn_chim_size_sysctl, "I", "Chimney send packet size limit"); 5491 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", 5492 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5493 __offsetof(struct hn_tx_ring, hn_direct_tx_size), 5494 hn_tx_conf_int_sysctl, "I", 5495 "Size of the packet for direct transmission"); 5496 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", 5497 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 5498 __offsetof(struct hn_tx_ring, hn_sched_tx), 5499 hn_tx_conf_int_sysctl, "I", 5500 "Always schedule transmission " 5501 "instead of doing direct transmission"); 5502 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", 5503 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); 5504 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", 5505 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); 5506 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", 5507 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, 5508 "Applied packet transmission aggregation size"); 5509 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", 5510 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5511 hn_txagg_pktmax_sysctl, "I", 5512 "Applied packet transmission aggregation packets"); 5513 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", 5514 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, 5515 hn_txagg_align_sysctl, "I", 5516 "Applied packet transmission aggregation alignment"); 5517 5518 return 0; 5519} 5520 5521static void 5522hn_set_chim_size(struct hn_softc *sc, int chim_size) 5523{ 5524 int i; 5525 5526 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5527 sc->hn_tx_ring[i].hn_chim_size = chim_size; 5528} 5529 5530static void 5531hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) 5532{ 5533 struct ifnet *ifp = sc->hn_ifp; 5534 u_int hw_tsomax; 5535 int tso_minlen; 5536 5537 HN_LOCK_ASSERT(sc); 5538 5539 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) 5540 return; 5541 5542 KASSERT(sc->hn_ndis_tso_sgmin >= 2, 5543 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); 5544 tso_minlen = sc->hn_ndis_tso_sgmin * mtu; 5545 5546 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && 5547 sc->hn_ndis_tso_szmax <= IP_MAXPACKET, 5548 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); 5549 5550 if (tso_maxlen < tso_minlen) 5551 tso_maxlen = tso_minlen; 5552 else if (tso_maxlen > IP_MAXPACKET) 5553 tso_maxlen = IP_MAXPACKET; 5554 if (tso_maxlen > sc->hn_ndis_tso_szmax) 5555 tso_maxlen = sc->hn_ndis_tso_szmax; 5556 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); 5557 5558 if (hn_xpnt_vf_isready(sc)) { 5559 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) 5560 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; 5561 } 5562 ifp->if_hw_tsomax = hw_tsomax; 5563 if (bootverbose) 5564 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); 5565} 5566 5567static void 5568hn_fixup_tx_data(struct hn_softc *sc) 5569{ 5570 uint64_t csum_assist; 5571 int i; 5572 5573 hn_set_chim_size(sc, sc->hn_chim_szmax); 5574 if (hn_tx_chimney_size > 0 && 5575 hn_tx_chimney_size < sc->hn_chim_szmax) 5576 hn_set_chim_size(sc, hn_tx_chimney_size); 5577 
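/* Translate the negotiated HN_CAP_* bits into the CSUM_* flags advertised to the stack through hn_csum_assist, e.g. HN_CAP_TCP4CS -> CSUM_IP_TCP; UDP checksum offload is additionally gated by the hn_enable_udp4cs/hn_enable_udp6cs tunables. */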
5578 csum_assist = 0; 5579 if (sc->hn_caps & HN_CAP_IPCS) 5580 csum_assist |= CSUM_IP; 5581 if (sc->hn_caps & HN_CAP_TCP4CS) 5582 csum_assist |= CSUM_IP_TCP; 5583 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) 5584 csum_assist |= CSUM_IP_UDP; 5585 if (sc->hn_caps & HN_CAP_TCP6CS) 5586 csum_assist |= CSUM_IP6_TCP; 5587 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) 5588 csum_assist |= CSUM_IP6_UDP; 5589 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5590 sc->hn_tx_ring[i].hn_csum_assist = csum_assist; 5591 5592 if (sc->hn_caps & HN_CAP_HASHVAL) { 5593 /* 5594 * Support HASHVAL pktinfo on TX path. 5595 */ 5596 if (bootverbose) 5597 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); 5598 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5599 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; 5600 } 5601} 5602 5603static void 5604hn_fixup_rx_data(struct hn_softc *sc) 5605{ 5606 5607 if (sc->hn_caps & HN_CAP_UDPHASH) { 5608 int i; 5609 5610 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) 5611 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; 5612 } 5613} 5614 5615static void 5616hn_destroy_tx_data(struct hn_softc *sc) 5617{ 5618 int i; 5619 5620 if (sc->hn_chim != NULL) { 5621 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { 5622 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); 5623 } else { 5624 device_printf(sc->hn_dev, 5625 "chimney sending buffer is referenced"); 5626 } 5627 sc->hn_chim = NULL; 5628 } 5629 5630 if (sc->hn_tx_ring_cnt == 0) 5631 return; 5632 5633 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) 5634 hn_tx_ring_destroy(&sc->hn_tx_ring[i]); 5635 5636 free(sc->hn_tx_ring, M_DEVBUF); 5637 sc->hn_tx_ring = NULL; 5638 5639 sc->hn_tx_ring_cnt = 0; 5640 sc->hn_tx_ring_inuse = 0; 5641} 5642 5643#ifdef HN_IFSTART_SUPPORT 5644 5645static void 5646hn_start_taskfunc(void *xtxr, int pending __unused) 5647{ 5648 struct hn_tx_ring *txr = xtxr; 5649 5650 mtx_lock(&txr->hn_tx_lock); 5651 hn_start_locked(txr, 0); 5652 mtx_unlock(&txr->hn_tx_lock); 5653} 5654 5655static int 5656hn_start_locked(struct hn_tx_ring *txr, int len) 5657{ 5658 struct hn_softc *sc = txr->hn_sc; 5659 struct ifnet *ifp = sc->hn_ifp; 5660 int sched = 0; 5661 5662 KASSERT(hn_use_if_start, 5663 ("hn_start_locked is called, when if_start is disabled")); 5664 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5665 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5666 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5667 5668 if (__predict_false(txr->hn_suspended)) 5669 return (0); 5670 5671 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != 5672 IFF_DRV_RUNNING) 5673 return (0); 5674 5675 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { 5676 struct hn_txdesc *txd; 5677 struct mbuf *m_head; 5678 int error; 5679 5680 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); 5681 if (m_head == NULL) 5682 break; 5683 5684 if (len > 0 && m_head->m_pkthdr.len > len) { 5685 /* 5686 * This sending could be time consuming; let callers 5687 * dispatch this packet sending (and sending of any 5688 * following up packets) to tx taskqueue. 
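 * A zero len, as passed from the taskqueue functions, disables this cutoff; hn_start passes txr->hn_direct_tx_size when it tries direct transmission.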
*/ 5690 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5691 sched = 1; 5692 break; 5693 } 5694 5695#if defined(INET6) || defined(INET) 5696 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { 5697 m_head = hn_tso_fixup(m_head); 5698 if (__predict_false(m_head == NULL)) { 5699 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5700 continue; 5701 } 5702 } else if (m_head->m_pkthdr.csum_flags & 5703 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 5704 m_head = hn_set_hlen(m_head); 5705 if (__predict_false(m_head == NULL)) { 5706 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5707 continue; 5708 } 5709 } 5710#endif 5711 5712 txd = hn_txdesc_get(txr); 5713 if (txd == NULL) { 5714 txr->hn_no_txdescs++; 5715 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5716 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5717 break; 5718 } 5719 5720 error = hn_encap(ifp, txr, txd, &m_head); 5721 if (error) { 5722 /* Both txd and m_head are freed */ 5723 KASSERT(txr->hn_agg_txd == NULL, 5724 ("encap failed w/ pending aggregating txdesc")); 5725 continue; 5726 } 5727 5728 if (txr->hn_agg_pktleft == 0) { 5729 if (txr->hn_agg_txd != NULL) { 5730 KASSERT(m_head == NULL, 5731 ("pending mbuf for aggregating txdesc")); 5732 error = hn_flush_txagg(ifp, txr); 5733 if (__predict_false(error)) { 5734 atomic_set_int(&ifp->if_drv_flags, 5735 IFF_DRV_OACTIVE); 5736 break; 5737 } 5738 } else { 5739 KASSERT(m_head != NULL, ("mbuf was freed")); 5740 error = hn_txpkt(ifp, txr, txd); 5741 if (__predict_false(error)) { 5742 /* txd is freed, but m_head is not */ 5743 IFQ_DRV_PREPEND(&ifp->if_snd, m_head); 5744 atomic_set_int(&ifp->if_drv_flags, 5745 IFF_DRV_OACTIVE); 5746 break; 5747 } 5748 } 5749 } 5750#ifdef INVARIANTS 5751 else { 5752 KASSERT(txr->hn_agg_txd != NULL, 5753 ("no aggregating txdesc")); 5754 KASSERT(m_head == NULL, 5755 ("pending mbuf for aggregating txdesc")); 5756 } 5757#endif 5758 } 5759 5760 /* Flush pending aggregated transmission. */ 5761 if (txr->hn_agg_txd != NULL) 5762 hn_flush_txagg(ifp, txr); 5763 return (sched); 5764} 5765 5766static void 5767hn_start(struct ifnet *ifp) 5768{ 5769 struct hn_softc *sc = ifp->if_softc; 5770 struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; 5771 5772 if (txr->hn_sched_tx) 5773 goto do_sched; 5774 5775 if (mtx_trylock(&txr->hn_tx_lock)) { 5776 int sched; 5777 5778 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5779 mtx_unlock(&txr->hn_tx_lock); 5780 if (!sched) 5781 return; 5782 } 5783do_sched: 5784 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 5785} 5786 5787static void 5788hn_start_txeof_taskfunc(void *xtxr, int pending __unused) 5789{ 5790 struct hn_tx_ring *txr = xtxr; 5791 5792 mtx_lock(&txr->hn_tx_lock); 5793 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); 5794 hn_start_locked(txr, 0); 5795 mtx_unlock(&txr->hn_tx_lock); 5796} 5797 5798static void 5799hn_start_txeof(struct hn_tx_ring *txr) 5800{ 5801 struct hn_softc *sc = txr->hn_sc; 5802 struct ifnet *ifp = sc->hn_ifp; 5803 5804 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); 5805 5806 if (txr->hn_sched_tx) 5807 goto do_sched; 5808 5809 if (mtx_trylock(&txr->hn_tx_lock)) { 5810 int sched; 5811 5812 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5813 sched = hn_start_locked(txr, txr->hn_direct_tx_size); 5814 mtx_unlock(&txr->hn_tx_lock); 5815 if (sched) { 5816 taskqueue_enqueue(txr->hn_tx_taskq, 5817 &txr->hn_tx_task); 5818 } 5819 } else { 5820do_sched: 5821 /* 5822 * Release the OACTIVE earlier, with the hope that 5823 * others could catch up.
The task will clear the 5824 * flag again with the hn_tx_lock to avoid possible 5825 * races. 5826 */ 5827 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); 5828 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); 5829 } 5830} 5831 5832#endif /* HN_IFSTART_SUPPORT */ 5833 5834static int 5835hn_xmit(struct hn_tx_ring *txr, int len) 5836{ 5837 struct hn_softc *sc = txr->hn_sc; 5838 struct ifnet *ifp = sc->hn_ifp; 5839 struct mbuf *m_head; 5840 int sched = 0; 5841 5842 mtx_assert(&txr->hn_tx_lock, MA_OWNED); 5843#ifdef HN_IFSTART_SUPPORT 5844 KASSERT(hn_use_if_start == 0, 5845 ("hn_xmit is called, when if_start is enabled")); 5846#endif 5847 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); 5848 5849 if (__predict_false(txr->hn_suspended)) 5850 return (0); 5851 5852 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) 5853 return (0); 5854 5855 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { 5856 struct hn_txdesc *txd; 5857 int error; 5858 5859 if (len > 0 && m_head->m_pkthdr.len > len) { 5860 /* 5861 * This sending could be time consuming; let callers 5862 * dispatch this packet sending (and sending of any 5863 * following up packets) to tx taskqueue. 5864 */ 5865 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5866 sched = 1; 5867 break; 5868 } 5869 5870 txd = hn_txdesc_get(txr); 5871 if (txd == NULL) { 5872 txr->hn_no_txdescs++; 5873 drbr_putback(ifp, txr->hn_mbuf_br, m_head); 5874 txr->hn_oactive = 1; 5875 break; 5876 } 5877 5878 error = hn_encap(ifp, txr, txd, &m_head); 5879 if (error) { 5880 /* Both txd and m_head are freed; discard */ 5881 KASSERT(txr->hn_agg_txd == NULL, 5882 ("encap failed w/ pending aggregating txdesc")); 5883 drbr_advance(ifp, txr->hn_mbuf_br); 5884 continue; 5885 } 5886 5887 if (txr->hn_agg_pktleft == 0) { 5888 if (txr->hn_agg_txd != NULL) { 5889 KASSERT(m_head == NULL, 5890 ("pending mbuf for aggregating txdesc")); 5891 error = hn_flush_txagg(ifp, txr); 5892 if (__predict_false(error)) { 5893 txr->hn_oactive = 1; 5894 break; 5895 } 5896 } else { 5897 KASSERT(m_head != NULL, ("mbuf was freed")); 5898 error = hn_txpkt(ifp, txr, txd); 5899 if (__predict_false(error)) { 5900 /* txd is freed, but m_head is not */ 5901 drbr_putback(ifp, txr->hn_mbuf_br, 5902 m_head); 5903 txr->hn_oactive = 1; 5904 break; 5905 } 5906 } 5907 } 5908#ifdef INVARIANTS 5909 else { 5910 KASSERT(txr->hn_agg_txd != NULL, 5911 ("no aggregating txdesc")); 5912 KASSERT(m_head == NULL, 5913 ("pending mbuf for aggregating txdesc")); 5914 } 5915#endif 5916 5917 /* Sent */ 5918 drbr_advance(ifp, txr->hn_mbuf_br); 5919 } 5920 5921 /* Flush pending aggregated transmission. */ 5922 if (txr->hn_agg_txd != NULL) 5923 hn_flush_txagg(ifp, txr); 5924 return (sched); 5925} 5926 5927static int 5928hn_transmit(struct ifnet *ifp, struct mbuf *m) 5929{ 5930 struct hn_softc *sc = ifp->if_softc; 5931 struct hn_tx_ring *txr; 5932 int error, idx = 0; 5933 5934 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { 5935 struct rm_priotracker pt; 5936 5937 rm_rlock(&sc->hn_vf_lock, &pt); 5938 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { 5939 struct mbuf *m_bpf = NULL; 5940 int obytes, omcast = 0; 5941 5942 obytes = m->m_pkthdr.len; 5943 if (m->m_flags & M_MCAST) 5944 omcast = 1; 5945 5946 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { 5947 if (bpf_peers_present(ifp->if_bpf)) { 5948 m_bpf = m_copypacket(m, M_NOWAIT); 5949 if (m_bpf == NULL) { 5950 /* 5951 * Failed to grab a shallow 5952 * copy; tap now.
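 * I.e. tap before handing the mbuf to the VF, at the cost of capturing a packet that the VF may subsequently fail to send.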
5953 */ 5954 ETHER_BPF_MTAP(ifp, m); 5955 } 5956 } 5957 } else { 5958 ETHER_BPF_MTAP(ifp, m); 5959 } 5960 5961 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); 5962 rm_runlock(&sc->hn_vf_lock, &pt); 5963 5964 if (m_bpf != NULL) { 5965 if (!error) 5966 ETHER_BPF_MTAP(ifp, m_bpf); 5967 m_freem(m_bpf); 5968 } 5969 5970 if (error == ENOBUFS) { 5971 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 5972 } else if (error) { 5973 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5974 } else { 5975 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); 5976 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); 5977 if (omcast) { 5978 if_inc_counter(ifp, IFCOUNTER_OMCASTS, 5979 omcast); 5980 } 5981 } 5982 return (error); 5983 } 5984 rm_runlock(&sc->hn_vf_lock, &pt); 5985 } 5986 5987#if defined(INET6) || defined(INET) 5988 /* 5989 * Perform TSO packet header fixup or get l2/l3 header length now, 5990 * since packet headers should be cache-hot. 5991 */ 5992 if (m->m_pkthdr.csum_flags & CSUM_TSO) { 5993 m = hn_tso_fixup(m); 5994 if (__predict_false(m == NULL)) { 5995 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 5996 return EIO; 5997 } 5998 } else if (m->m_pkthdr.csum_flags & 5999 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { 6000 m = hn_set_hlen(m); 6001 if (__predict_false(m == NULL)) { 6002 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); 6003 return EIO; 6004 } 6005 } 6006#endif 6007 6008 /* 6009 * Select the TX ring based on flowid 6010 */ 6011 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 6012#if defined(INET6) || defined(INET) 6013 int tcpsyn = 0; 6014 6015 if (m->m_pkthdr.len < 128 && 6016 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) && 6017 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { 6018 m = hn_check_tcpsyn(m, &tcpsyn); 6019 if (__predict_false(m == NULL)) { 6020 if_inc_counter(ifp, 6021 IFCOUNTER_OERRORS, 1); 6022 return (EIO); 6023 } 6024 } 6025#else 6026 const int tcpsyn = 0; 6027#endif 6028 if (tcpsyn) 6029 idx = 0; 6030 else 6031 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; 6032 } 6033 txr = &sc->hn_tx_ring[idx]; 6034 6035 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); 6036 if (error) { 6037 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); 6038 return error; 6039 } 6040 6041 if (txr->hn_oactive) 6042 return 0; 6043 6044 if (txr->hn_sched_tx) 6045 goto do_sched; 6046 6047 if (mtx_trylock(&txr->hn_tx_lock)) { 6048 int sched; 6049 6050 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6051 mtx_unlock(&txr->hn_tx_lock); 6052 if (!sched) 6053 return 0; 6054 } 6055do_sched: 6056 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); 6057 return 0; 6058} 6059 6060static void 6061hn_tx_ring_qflush(struct hn_tx_ring *txr) 6062{ 6063 struct mbuf *m; 6064 6065 mtx_lock(&txr->hn_tx_lock); 6066 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) 6067 m_freem(m); 6068 mtx_unlock(&txr->hn_tx_lock); 6069} 6070 6071static void 6072hn_xmit_qflush(struct ifnet *ifp) 6073{ 6074 struct hn_softc *sc = ifp->if_softc; 6075 struct rm_priotracker pt; 6076 int i; 6077 6078 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) 6079 hn_tx_ring_qflush(&sc->hn_tx_ring[i]); 6080 if_qflush(ifp); 6081 6082 rm_rlock(&sc->hn_vf_lock, &pt); 6083 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) 6084 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); 6085 rm_runlock(&sc->hn_vf_lock, &pt); 6086} 6087 6088static void 6089hn_xmit_txeof(struct hn_tx_ring *txr) 6090{ 6091 6092 if (txr->hn_sched_tx) 6093 goto do_sched; 6094 6095 if (mtx_trylock(&txr->hn_tx_lock)) { 6096 int sched; 6097 6098 txr->hn_oactive = 0; 6099 sched = hn_xmit(txr, txr->hn_direct_tx_size); 6100 
static void
hn_xmit_txeof(struct hn_tx_ring *txr)
{

	if (txr->hn_sched_tx)
		goto do_sched;

	if (mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		txr->hn_oactive = 0;
		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (sched) {
			taskqueue_enqueue(txr->hn_tx_taskq,
			    &txr->hn_tx_task);
		}
	} else {
do_sched:
		/*
		 * Release oactive early, in the hope that others can
		 * catch up.  The task will clear oactive again, with
		 * the hn_tx_lock held, to avoid possible races.
		 */
		txr->hn_oactive = 0;
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_xmit_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	hn_xmit(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

static void
hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
{
	struct hn_tx_ring *txr = xtxr;

	mtx_lock(&txr->hn_tx_lock);
	txr->hn_oactive = 0;
	hn_xmit(txr, 0);
	mtx_unlock(&txr->hn_tx_lock);
}

static int
hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct vmbus_chan_br cbr;
	struct hn_rx_ring *rxr;
	struct hn_tx_ring *txr = NULL;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Link this channel to the RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
	    ("RX ring %d already attached", idx));
	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
	rxr->hn_chan = chan;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
		    idx, vmbus_chan_id(chan));
	}

	if (idx < sc->hn_tx_ring_inuse) {
		txr = &sc->hn_tx_ring[idx];
		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
		    ("TX ring %d already attached", idx));
		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;

		txr->hn_chan = chan;
		if (bootverbose) {
			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
			    idx, vmbus_chan_id(chan));
		}
	}

	/* Bind this channel to a proper CPU. */
	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));

	/*
	 * Open this channel.
	 */
	cbr.cbr = rxr->hn_br;
	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
	cbr.cbr_txsz = HN_TXBR_SIZE;
	cbr.cbr_rxsz = HN_RXBR_SIZE;
	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
	if (error) {
		if (error == EISCONN) {
			if_printf(sc->hn_ifp, "bufring is connected after "
			    "chan%u open failure\n", vmbus_chan_id(chan));
			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
		} else {
			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
			    vmbus_chan_id(chan), error);
		}
	}
	return (error);
}
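/*
 * NOTE:
 * Both channel open and close may fail with EISCONN, meaning the
 * bufring pages are still connected (referenced) on the host side.
 * HN_RX_FLAG_BR_REF records this, so that the bufring memory is never
 * reused; hn_synth_attachable() refuses re-attach while any ring
 * still carries this flag.
 */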
static void
hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
{
	struct hn_rx_ring *rxr;
	int idx, error;

	idx = vmbus_chan_subidx(chan);

	/*
	 * Unlink this channel from the RX/TX ring.
	 */
	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
	    ("invalid channel index %d, should be >= 0 && < %d",
	     idx, sc->hn_rx_ring_inuse));
	rxr = &sc->hn_rx_ring[idx];
	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
	    ("RX ring %d is not attached", idx));
	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;

	if (idx < sc->hn_tx_ring_inuse) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];

		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
		    ("TX ring %d is not attached", idx));
		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
	}

	/*
	 * Close this channel.
	 *
	 * NOTE:
	 * Channel closing does _not_ destroy the target channel.
	 */
	error = vmbus_chan_close_direct(chan);
	if (error == EISCONN) {
		if_printf(sc->hn_ifp, "chan%u bufring is connected "
		    "after being closed\n", vmbus_chan_id(chan));
		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
	} else if (error) {
		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
		    vmbus_chan_id(chan), error);
	}
}

static int
hn_attach_subchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i, error = 0;

	KASSERT(subchan_cnt > 0, ("no sub-channels"));

	/* Attach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i) {
		int error1;

		error1 = hn_chan_attach(sc, subchans[i]);
		if (error1) {
			error = error1;
			/* Move on; all channels will be detached later. */
		}
	}
	vmbus_subchan_rel(subchans, subchan_cnt);

	if (error) {
		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n",
		    error);
	} else {
		if (bootverbose) {
			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
			    subchan_cnt);
		}
	}
	return (error);
}

static void
hn_detach_allchans(struct hn_softc *sc)
{
	struct vmbus_channel **subchans;
	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
	int i;

	if (subchan_cnt == 0)
		goto back;

	/* Detach the sub-channels. */
	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
	for (i = 0; i < subchan_cnt; ++i)
		hn_chan_detach(sc, subchans[i]);
	vmbus_subchan_rel(subchans, subchan_cnt);

back:
	/*
	 * Detach the primary channel, _after_ all sub-channels
	 * are detached.
	 */
	hn_chan_detach(sc, sc->hn_prichan);

	/* Wait for the sub-channels to be destroyed, if any. */
	vmbus_subchan_drain(sc->hn_prichan);

#ifdef INVARIANTS
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
		    HN_RX_FLAG_ATTACHED) == 0,
		    ("%dth RX ring is still attached", i));
	}
	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
		    HN_TX_FLAG_ATTACHED) == 0,
		    ("%dth TX ring is still attached", i));
	}
#endif
}
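/*
 * Negotiate the number of sub-channels with the host.  On input
 * *nsubch is the wanted number of sub-channels; on output it is the
 * number actually granted (0 means single-channel operation).  E.g.
 * requesting 7 sub-channels (8 rings) from a host that offers only 4
 * RX rings results in at most 3 sub-channels being allocated from NVS.
 */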
static int
hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
{
	struct vmbus_channel **subchans;
	int nchan, rxr_cnt, error;

	nchan = *nsubch + 1;
	if (nchan == 1) {
		/*
		 * Multiple RX/TX rings are not requested.
		 */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Query RSS capabilities, e.g. # of RX rings and # of indirect
	 * table entries.
	 */
	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
	if (error) {
		/* No RSS; this is benign. */
		*nsubch = 0;
		return (0);
	}
	if (bootverbose) {
		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
		    rxr_cnt, nchan);
	}

	if (nchan > rxr_cnt)
		nchan = rxr_cnt;
	if (nchan == 1) {
		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
		*nsubch = 0;
		return (0);
	}

	/*
	 * Allocate sub-channels from NVS.
	 */
	*nsubch = nchan - 1;
	error = hn_nvs_alloc_subchans(sc, nsubch);
	if (error || *nsubch == 0) {
		/* Failed to allocate sub-channels. */
		*nsubch = 0;
		return (0);
	}

	/*
	 * Wait for all sub-channels to become ready before moving on.
	 */
	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
	vmbus_subchan_rel(subchans, *nsubch);
	return (0);
}

static bool
hn_synth_attachable(const struct hn_softc *sc)
{
	int i;

	if (sc->hn_flags & HN_FLAG_ERRORS)
		return (false);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
			return (false);
	}
	return (true);
}

/*
 * Make sure that the RX filter is zero after the successful
 * RNDIS initialization.
 *
 * NOTE:
 * Under certain conditions on certain versions of Hyper-V,
 * the RNDIS rxfilter is _not_ zero on the hypervisor side
 * after the successful RNDIS initialization, which breaks
 * the assumption of any following code (well, it breaks the
 * RNDIS API contract actually).  Clear the RNDIS rxfilter
 * explicitly, drain packets sneaking through, and drain the
 * interrupt taskqueues scheduled due to the stealth packets.
 */
static void
hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
{

	hn_disable_rx(sc);
	hn_drain_rxtx(sc, nchan);
}
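/*
 * Bring up the synthetic parts in order: primary channel, NVS, RNDIS,
 * then the sub-channels, and finally the RSS key and indirect table.
 * A failure unwinds only the pieces that were actually attached; once
 * HN_FLAG_SYNTH_ATTACHED is set, a full hn_synth_detach() is required
 * instead.
 */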
static int
hn_synth_attach(struct hn_softc *sc, int mtu)
{
#define ATTACHED_NVS		0x0002
#define ATTACHED_RNDIS		0x0004

	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int error, nsubch, nchan = 1, i, rndis_inited;
	uint32_t old_caps, attached = 0;

	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
	    ("synthetic parts were attached"));

	if (!hn_synth_attachable(sc))
		return (ENXIO);

	/* Save capabilities for later verification. */
	old_caps = sc->hn_caps;
	sc->hn_caps = 0;

	/* Clear RSS state. */
	sc->hn_rss_ind_size = 0;
	sc->hn_rss_hash = 0;
	sc->hn_rss_hcap = 0;

	/*
	 * Attach the primary channel _before_ attaching NVS and RNDIS.
	 */
	error = hn_chan_attach(sc, sc->hn_prichan);
	if (error)
		goto failed;

	/*
	 * Attach NVS.
	 */
	error = hn_nvs_attach(sc, mtu);
	if (error)
		goto failed;
	attached |= ATTACHED_NVS;

	/*
	 * Attach RNDIS _after_ NVS is attached.
	 */
	error = hn_rndis_attach(sc, mtu, &rndis_inited);
	if (rndis_inited)
		attached |= ATTACHED_RNDIS;
	if (error)
		goto failed;

	/*
	 * Make sure capabilities are not changed.
	 */
	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
		    old_caps, sc->hn_caps);
		error = ENXIO;
		goto failed;
	}

	/*
	 * Allocate sub-channels for multi-TX/RX rings.
	 *
	 * NOTE:
	 * The # of RX rings that can be used is equivalent to the # of
	 * channels to be requested.
	 */
	nsubch = sc->hn_rx_ring_cnt - 1;
	error = hn_synth_alloc_subchans(sc, &nsubch);
	if (error)
		goto failed;
	/* NOTE: _Full_ synthetic parts detach is required now. */
	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;

	/*
	 * Set the # of TX/RX rings that could be used according to
	 * the # of channels that NVS offered.
	 */
	nchan = nsubch + 1;
	hn_set_ring_inuse(sc, nchan);
	if (nchan == 1) {
		/* Only the primary channel can be used; done. */
		goto back;
	}

	/*
	 * Attach the sub-channels.
	 *
	 * NOTE: hn_set_ring_inuse() _must_ have been called.
	 */
	error = hn_attach_subchans(sc);
	if (error)
		goto failed;

	/*
	 * Configure the RSS key and indirect table _after_ all
	 * sub-channels are attached.
	 */
	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
		/*
		 * The RSS key is not set yet; set it to the default
		 * RSS key.
		 */
		if (bootverbose)
			if_printf(sc->hn_ifp, "setup default RSS key\n");
		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	}

	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
		/*
		 * The RSS indirect table is not set yet; set it up in
		 * round-robin fashion.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "setup default RSS indirect "
			    "table\n");
		}
		for (i = 0; i < NDIS_HASH_INDCNT; ++i)
			rss->rss_ind[i] = i % nchan;
		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
	} else {
		/*
		 * The # of usable channels may have changed, so we
		 * have to make sure that all entries in the RSS
		 * indirect table are valid.
		 *
		 * NOTE: hn_set_ring_inuse() _must_ have been called.
		 */
		hn_rss_ind_fixup(sc);
	}

	sc->hn_rss_hash = sc->hn_rss_hcap;
	if ((sc->hn_flags & HN_FLAG_RXVF) ||
	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
		/* NOTE: Don't reconfigure RSS; it is done immediately below. */
		hn_vf_rss_fixup(sc, false);
	}
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error)
		goto failed;
back:
	/*
	 * Fix up the transmission aggregation setup.
	 */
	hn_set_txagg(sc);
	hn_rndis_init_fixat(sc, nchan);
	return (0);

failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
		hn_rndis_init_fixat(sc, nchan);
		hn_synth_detach(sc);
	} else {
		if (attached & ATTACHED_RNDIS) {
			hn_rndis_init_fixat(sc, nchan);
			hn_rndis_detach(sc);
		}
		if (attached & ATTACHED_NVS)
			hn_nvs_detach(sc);
		hn_chan_detach(sc, sc->hn_prichan);
		/* Restore old capabilities. */
		sc->hn_caps = old_caps;
	}
	return (error);

#undef ATTACHED_RNDIS
#undef ATTACHED_NVS
}
/*
 * NOTE:
 * The interface must have been suspended through hn_suspend(),
 * before this function is called.
 */
static void
hn_synth_detach(struct hn_softc *sc)
{

	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("synthetic parts were not attached"));

	/* Detach the RNDIS first. */
	hn_rndis_detach(sc);

	/* Detach NVS. */
	hn_nvs_detach(sc);

	/* Detach all of the channels. */
	hn_detach_allchans(sc);

	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
}

static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{
	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}

static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor,
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}

static void
hn_disable_rx(struct hn_softc *sc)
{

	/*
	 * Disable RX by clearing the RX filter forcefully.
	 */
	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */

	/*
	 * Give RNDIS enough time to flush all pending data packets.
	 */
	pause("waitrx", (200 * hz) / 1000);
}

/*
 * NOTE:
 * RX/TX _must_ have been suspended/disabled, before this function
 * is called.
 */
static void
hn_drain_rxtx(struct hn_softc *sc, int nchan)
{
	struct vmbus_channel **subch = NULL;
	int nsubch;

	/*
	 * Drain the RX/TX bufrings and interrupts.
	 */
	nsubch = nchan - 1;
	if (nsubch > 0)
		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);

	if (subch != NULL) {
		int i;

		for (i = 0; i < nsubch; ++i)
			hn_chan_drain(sc, subch[i]);
	}
	hn_chan_drain(sc, sc->hn_prichan);

	if (subch != NULL)
		vmbus_subchan_rel(subch, nsubch);
}
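/*
 * Data path suspension works in this order:
 * 1. Mark all TX rings suspended, so no new sends can start.
 * 2. Wait for in-flight sends to complete (unless the primary
 *    channel is revoked, in which case send-dones may never come).
 * 3. Clear the RX filter and let pending RX flush out.
 * 4. Drain the channel bufrings and interrupt taskqueues, then
 *    the TX taskqueues.
 */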
static void
hn_suspend_data(struct hn_softc *sc)
{
	struct hn_tx_ring *txr;
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/*
		 * Wait for all pending sends to finish.
		 *
		 * NOTE:
		 * We will _not_ receive all pending send-dones, if the
		 * primary channel is revoked.
		 */
		while (hn_tx_ring_pending(txr) &&
		    !vmbus_chan_is_revoked(sc->hn_prichan))
			pause("hnwtx", 1 /* 1 tick */);
	}

	/*
	 * Disable RX.
	 */
	hn_disable_rx(sc);

	/*
	 * Drain RX/TX.
	 */
	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);

	/*
	 * Drain any pending TX tasks.
	 *
	 * NOTE:
	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
	 * tasks have to be drained _after_ the above hn_drain_rxtx().
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}

static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}

static void
hn_suspend(struct hn_softc *sc)
{

	/* Disable polling. */
	hn_polling(sc, 0);

	/*
	 * If the non-transparent mode VF is activated, the synthetic
	 * device is receiving packets, so the data path of the
	 * synthetic device must be suspended.
	 */
	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
	    (sc->hn_flags & HN_FLAG_RXVF))
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}

static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Re-enable RX.
	 */
	hn_rxfilter_config(sc);

	/*
	 * Make sure to clear the suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush the unused drbrs, since hn_tx_ring_inuse may
		 * have been reduced.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use the txeof task, so that any pending oactive
		 * can be cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start the link status
	 * checks, which are more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}

static void
hn_resume(struct hn_softc *sc)
{

	/*
	 * If the non-transparent mode VF is activated, the synthetic
	 * device has to receive packets, so the data path of the
	 * synthetic device must be resumed.
	 */
	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
	    (sc->hn_flags & HN_FLAG_RXVF))
		hn_resume_data(sc);

	/*
	 * Don't resume link status change if the VF is attached/activated.
	 * - In the non-transparent VF mode, the synthetic device marks
	 *   link down until the VF is deactivated; i.e. the VF is down.
	 * - In the transparent VF mode, the VF's media status is used
	 *   until the VF is detached.
	 */
	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
		hn_resume_mgmt(sc);

	/*
	 * Re-enable polling if this interface is running and
	 * the polling is requested.
	 */
	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
		hn_polling(sc, sc->hn_pollhz);
}

static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
	case RNDIS_STATUS_LINK_SPEED_CHANGE:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}
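/*
 * Walk the RNDIS per-packet-info (pktinfo) list attached to a data
 * message.  Each element is a struct rndis_pktinfo { rm_size, rm_type,
 * rm_pktinfooffset, rm_data[] }, where rm_size covers the whole
 * element, so the next element lives at (uint8_t *)pi + pi->rm_size.
 * Only the VLAN, checksum and hash information are extracted here;
 * elements of any other type are skipped.
 */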
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done. */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}

static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
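/*
 * hn_rndis_check_overlap() above treats (off, len) and (check_off,
 * check_len) as half-open ranges and returns true iff they intersect;
 * e.g. off 16/len 8 vs check_off 24/len 8 do not overlap
 * (16 + 8 <= 24), while off 16/len 9 vs check_off 24/len 8 do.
 */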
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet the data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}
	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}

static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore. */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}

static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}

static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is an RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
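/*
 * Complete the RXBUF channel packet identified by 'tid', so that the
 * hypervisor can recycle the receive buffer region it describes.  The
 * completion is retried a bounded number of times if the TX bufring
 * is full; if it still cannot be sent, that slice of the RXBUF is
 * leaked ("RXBUF ack failed" below).
 */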
7314 */ 7315 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); 7316} 7317 7318static void 7319hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, 7320 uint64_t tid) 7321{ 7322 struct hn_nvs_rndis_ack ack; 7323 int retries, error; 7324 7325 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; 7326 ack.nvs_status = HN_NVS_STATUS_OK; 7327 7328 retries = 0; 7329again: 7330 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 7331 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); 7332 if (__predict_false(error == EAGAIN)) { 7333 /* 7334 * NOTE: 7335 * This should _not_ happen in real world, since the 7336 * consumption of the TX bufring from the TX path is 7337 * controlled. 7338 */ 7339 if (rxr->hn_ack_failed == 0) 7340 if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); 7341 rxr->hn_ack_failed++; 7342 retries++; 7343 if (retries < 10) { 7344 DELAY(100); 7345 goto again; 7346 } 7347 /* RXBUF leaks! */ 7348 if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); 7349 } 7350} 7351 7352static void 7353hn_chan_callback(struct vmbus_channel *chan, void *xrxr) 7354{ 7355 struct hn_rx_ring *rxr = xrxr; 7356 struct hn_softc *sc = rxr->hn_ifp->if_softc; 7357 7358 for (;;) { 7359 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; 7360 int error, pktlen; 7361 7362 pktlen = rxr->hn_pktbuf_len; 7363 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); 7364 if (__predict_false(error == ENOBUFS)) { 7365 void *nbuf; 7366 int nlen; 7367 7368 /* 7369 * Expand channel packet buffer. 7370 * 7371 * XXX 7372 * Use M_WAITOK here, since allocation failure 7373 * is fatal. 7374 */ 7375 nlen = rxr->hn_pktbuf_len * 2; 7376 while (nlen < pktlen) 7377 nlen *= 2; 7378 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); 7379 7380 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", 7381 rxr->hn_pktbuf_len, nlen); 7382 7383 free(rxr->hn_pktbuf, M_DEVBUF); 7384 rxr->hn_pktbuf = nbuf; 7385 rxr->hn_pktbuf_len = nlen; 7386 /* Retry! */ 7387 continue; 7388 } else if (__predict_false(error == EAGAIN)) { 7389 /* No more channel packets; done! */ 7390 break; 7391 } 7392 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); 7393 7394 switch (pkt->cph_type) { 7395 case VMBUS_CHANPKT_TYPE_COMP: 7396 hn_nvs_handle_comp(sc, chan, pkt); 7397 break; 7398 7399 case VMBUS_CHANPKT_TYPE_RXBUF: 7400 hn_nvs_handle_rxbuf(rxr, chan, pkt); 7401 break; 7402 7403 case VMBUS_CHANPKT_TYPE_INBAND: 7404 hn_nvs_handle_notify(sc, pkt); 7405 break; 7406 7407 default: 7408 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", 7409 pkt->cph_type); 7410 break; 7411 } 7412 } 7413 hn_chan_rollup(rxr, rxr->hn_txr); 7414} 7415 7416static void 7417hn_sysinit(void *arg __unused) 7418{ 7419 int i; 7420 7421 hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); 7422 7423#ifdef HN_IFSTART_SUPPORT 7424 /* 7425 * Don't use ifnet.if_start if transparent VF mode is requested; 7426 * mainly due to the IFF_DRV_OACTIVE flag. 7427 */ 7428 if (hn_xpnt_vf && hn_use_if_start) { 7429 hn_use_if_start = 0; 7430 printf("hn: tranparent VF mode, if_transmit will be used, " 7431 "instead of if_start\n"); 7432 } 7433#endif 7434 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { 7435 printf("hn: invalid transparent VF attach routing " 7436 "wait timeout %d, reset to %d\n", 7437 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); 7438 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; 7439 } 7440 7441 /* 7442 * Initialize VF map. 
7443 */ 7444 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); 7445 hn_vfmap_size = HN_VFMAP_SIZE_DEF; 7446 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, 7447 M_WAITOK | M_ZERO); 7448 7449 /* 7450 * Fix the # of TX taskqueues. 7451 */ 7452 if (hn_tx_taskq_cnt <= 0) 7453 hn_tx_taskq_cnt = 1; 7454 else if (hn_tx_taskq_cnt > mp_ncpus) 7455 hn_tx_taskq_cnt = mp_ncpus; 7456 7457 /* 7458 * Fix the TX taskqueue mode. 7459 */ 7460 switch (hn_tx_taskq_mode) { 7461 case HN_TX_TASKQ_M_INDEP: 7462 case HN_TX_TASKQ_M_GLOBAL: 7463 case HN_TX_TASKQ_M_EVTTQ: 7464 break; 7465 default: 7466 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; 7467 break; 7468 } 7469 7470 if (vm_guest != VM_GUEST_HV) 7471 return; 7472 7473 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) 7474 return; 7475 7476 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), 7477 M_DEVBUF, M_WAITOK); 7478 for (i = 0; i < hn_tx_taskq_cnt; ++i) { 7479 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, 7480 taskqueue_thread_enqueue, &hn_tx_taskque[i]); 7481 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, 7482 "hn tx%d", i); 7483 } 7484} 7485SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); 7486 7487static void 7488hn_sysuninit(void *arg __unused) 7489{ 7490 7491 if (hn_tx_taskque != NULL) { 7492 int i; 7493 7494 for (i = 0; i < hn_tx_taskq_cnt; ++i) 7495 taskqueue_free(hn_tx_taskque[i]); 7496 free(hn_tx_taskque, M_DEVBUF); 7497 } 7498 7499 if (hn_vfmap != NULL) 7500 free(hn_vfmap, M_DEVBUF); 7501 rm_destroy(&hn_vfmap_lock); 7502 7503 counter_u64_free(hn_udpcs_fixup); 7504} 7505SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); 7506