1/*-
2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*-
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54
55#include <sys/cdefs.h>
56__FBSDID("$FreeBSD: stable/10/sys/dev/hyperv/netvsc/if_hn.c 356412 2020-01-06 09:51:54Z hselasky $");
57
58#include "opt_inet6.h"
59#include "opt_inet.h"
60#include "opt_hn.h"
61
62#include <sys/param.h>
63#include <sys/systm.h>
64#include <sys/bus.h>
65#include <sys/counter.h>
66#include <sys/kernel.h>
67#include <sys/limits.h>
68#include <sys/malloc.h>
69#include <sys/mbuf.h>
70#include <sys/module.h>
71#include <sys/proc.h>
72#include <sys/queue.h>
73#include <sys/lock.h>
74#include <sys/rmlock.h>
75#include <sys/sbuf.h>
76#include <sys/smp.h>
77#include <sys/socket.h>
78#include <sys/sockio.h>
79#include <sys/sx.h>
80#include <sys/sysctl.h>
81#include <sys/taskqueue.h>
82#include <sys/buf_ring.h>
83#include <sys/eventhandler.h>
84
85#include <machine/atomic.h>
86#include <machine/in_cksum.h>
87
88#include <net/bpf.h>
89#include <net/ethernet.h>
90#include <net/if.h>
91#include <net/if_arp.h>
92#include <net/if_dl.h>
93#include <net/if_media.h>
94#include <net/if_types.h>
95#include <net/if_var.h>
96#include <net/if_vlan_var.h>
97#include <net/rndis.h>
98
99#include <netinet/in_systm.h>
100#include <netinet/in.h>
101#include <netinet/ip.h>
102#include <netinet/ip6.h>
103#include <netinet/tcp.h>
104#include <netinet/tcp_lro.h>
105#include <netinet/udp.h>
106
107#include <dev/hyperv/include/hyperv.h>
108#include <dev/hyperv/include/hyperv_busdma.h>
109#include <dev/hyperv/include/vmbus.h>
110#include <dev/hyperv/include/vmbus_xact.h>
111
112#include <dev/hyperv/netvsc/ndis.h>
113#include <dev/hyperv/netvsc/if_hnreg.h>
114#include <dev/hyperv/netvsc/if_hnvar.h>
115#include <dev/hyperv/netvsc/hn_nvs.h>
116#include <dev/hyperv/netvsc/hn_rndis.h>
117
118#include "vmbus_if.h"
119
120#define HN_IFSTART_SUPPORT
121
122/* NOTE: M_HASHTYPE_RSS_UDP_IPV4 is not available on stable/10. */
123#ifndef M_HASHTYPE_RSS_UDP_IPV4
124#define M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_OPAQUE
125#endif
126
127#define HN_RING_CNT_DEF_MAX		8
128
129#define HN_VFMAP_SIZE_DEF		8
130
131#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
132
133/* YYY should get it from the underlying channel */
134#define HN_TX_DESC_CNT			512
135
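/*
 * Worst-case RNDIS packet message size: the message header plus
 * per-packet-info for hash value, VLAN, LSOv2 and TX checksum.
 */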
136#define HN_RNDIS_PKT_LEN					\
137	(sizeof(struct rndis_packet_msg) +			\
138	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
139	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
140	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
141	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
142#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
143#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
144
145#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
146#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
147#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
148/* -1 for RNDIS packet message */
149#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
150
151#define HN_DIRECT_TX_SIZE_DEF		128
152
153#define HN_EARLY_TXEOF_THRESH		8
154
155#define HN_PKTBUF_LEN_DEF		(16 * 1024)
156
157#define HN_LROENT_CNT_DEF		128
158
159#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
160#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
161/* YYY 2*MTU is a bit rough, but should be good enough. */
162#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
163
164#define HN_LRO_ACKCNT_DEF		1
165
166#define HN_LOCK_INIT(sc)		\
167	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
168#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
169#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
170#define HN_LOCK(sc)					\
171do {							\
172	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
173		DELAY(1000);				\
174} while (0)
175#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
176
177#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
178#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
179#define HN_CSUM_IP_HWASSIST(sc)		\
180	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
181#define HN_CSUM_IP6_HWASSIST(sc)	\
182	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
183
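/*
 * Estimated transmission size of a packet (frame data plus the RNDIS
 * packet message), rounded up to the given alignment.
 */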
184#define HN_PKTSIZE_MIN(align)		\
185	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
186	    HN_RNDIS_PKT_LEN, (align))
187#define HN_PKTSIZE(m, align)		\
188	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
189
190#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
191
192struct hn_txdesc {
193#ifndef HN_USE_TXDESC_BUFRING
194	SLIST_ENTRY(hn_txdesc)		link;
195#endif
196	STAILQ_ENTRY(hn_txdesc)		agg_link;
197
198	/* Aggregated txdescs, in sending order. */
199	STAILQ_HEAD(, hn_txdesc)	agg_list;
200
201	/* The oldest packet, if transmission aggregation happens. */
202	struct mbuf			*m;
203	struct hn_tx_ring		*txr;
204	int				refs;
205	uint32_t			flags;	/* HN_TXD_FLAG_ */
206	struct hn_nvs_sendctx		send_ctx;
207	uint32_t			chim_index;
208	int				chim_size;
209
210	bus_dmamap_t			data_dmap;
211
212	bus_addr_t			rndis_pkt_paddr;
213	struct rndis_packet_msg		*rndis_pkt;
214	bus_dmamap_t			rndis_pkt_dmap;
215};
216
217#define HN_TXD_FLAG_ONLIST		0x0001
218#define HN_TXD_FLAG_DMAMAP		0x0002
219#define HN_TXD_FLAG_ONAGG		0x0004
220
221struct hn_rxinfo {
222	uint32_t			vlan_info;
223	uint32_t			csum_info;
224	uint32_t			hash_info;
225	uint32_t			hash_value;
226};
227
228struct hn_rxvf_setarg {
229	struct hn_rx_ring	*rxr;
230	struct ifnet		*vf_ifp;
231};
232
233#define HN_RXINFO_VLAN			0x0001
234#define HN_RXINFO_CSUM			0x0002
235#define HN_RXINFO_HASHINF		0x0004
236#define HN_RXINFO_HASHVAL		0x0008
237#define HN_RXINFO_ALL			\
238	(HN_RXINFO_VLAN |		\
239	 HN_RXINFO_CSUM |		\
240	 HN_RXINFO_HASHINF |		\
241	 HN_RXINFO_HASHVAL)
242
243#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
244#define HN_NDIS_RXCSUM_INFO_INVALID	0
245#define HN_NDIS_HASH_INFO_INVALID	0
246
247static int			hn_probe(device_t);
248static int			hn_attach(device_t);
249static int			hn_detach(device_t);
250static int			hn_shutdown(device_t);
251static void			hn_chan_callback(struct vmbus_channel *,
252				    void *);
253
254static void			hn_init(void *);
255static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
256#ifdef HN_IFSTART_SUPPORT
257static void			hn_start(struct ifnet *);
258#endif
259static int			hn_transmit(struct ifnet *, struct mbuf *);
260static void			hn_xmit_qflush(struct ifnet *);
261static int			hn_ifmedia_upd(struct ifnet *);
262static void			hn_ifmedia_sts(struct ifnet *,
263				    struct ifmediareq *);
264
265static void			hn_ifnet_event(void *, struct ifnet *, int);
266static void			hn_ifaddr_event(void *, struct ifnet *);
267static void			hn_ifnet_attevent(void *, struct ifnet *);
268static void			hn_ifnet_detevent(void *, struct ifnet *);
269static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
270
271static bool			hn_ismyvf(const struct hn_softc *,
272				    const struct ifnet *);
273static void			hn_rxvf_change(struct hn_softc *,
274				    struct ifnet *, bool);
275static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
276static void			hn_rxvf_set_task(void *, int);
277static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
279static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
280				    struct ifreq *);
281static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
282static bool			hn_xpnt_vf_isready(struct hn_softc *);
283static void			hn_xpnt_vf_setready(struct hn_softc *);
284static void			hn_xpnt_vf_init_taskfunc(void *, int);
285static void			hn_xpnt_vf_init(struct hn_softc *);
286static void			hn_xpnt_vf_setenable(struct hn_softc *);
287static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288static void			hn_vf_rss_fixup(struct hn_softc *, bool);
289static void			hn_vf_rss_restore(struct hn_softc *);
290
291static int			hn_rndis_rxinfo(const void *, int,
292				    struct hn_rxinfo *);
293static void			hn_rndis_rx_data(struct hn_rx_ring *,
294				    const void *, int);
295static void			hn_rndis_rx_status(struct hn_softc *,
296				    const void *, int);
297static void			hn_rndis_init_fixat(struct hn_softc *, int);
298
299static void			hn_nvs_handle_notify(struct hn_softc *,
300				    const struct vmbus_chanpkt_hdr *);
301static void			hn_nvs_handle_comp(struct hn_softc *,
302				    struct vmbus_channel *,
303				    const struct vmbus_chanpkt_hdr *);
304static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305				    struct vmbus_channel *,
306				    const struct vmbus_chanpkt_hdr *);
307static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308				    struct vmbus_channel *, uint64_t);
309
310#if __FreeBSD_version >= 1100099
311static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
313#endif
314static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316#if __FreeBSD_version < 1100095
317static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
318#else
319static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
320#endif
321static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
329static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
330static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
331static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
332static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
333static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
334static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
335static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
336static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
337static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
338static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
339static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
340static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
341static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
342static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
343static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
344
345static void			hn_stop(struct hn_softc *, bool);
346static void			hn_init_locked(struct hn_softc *);
347static int			hn_chan_attach(struct hn_softc *,
348				    struct vmbus_channel *);
349static void			hn_chan_detach(struct hn_softc *,
350				    struct vmbus_channel *);
351static int			hn_attach_subchans(struct hn_softc *);
352static void			hn_detach_allchans(struct hn_softc *);
353static void			hn_chan_rollup(struct hn_rx_ring *,
354				    struct hn_tx_ring *);
355static void			hn_set_ring_inuse(struct hn_softc *, int);
356static int			hn_synth_attach(struct hn_softc *, int);
357static void			hn_synth_detach(struct hn_softc *);
358static int			hn_synth_alloc_subchans(struct hn_softc *,
359				    int *);
360static bool			hn_synth_attachable(const struct hn_softc *);
361static void			hn_suspend(struct hn_softc *);
362static void			hn_suspend_data(struct hn_softc *);
363static void			hn_suspend_mgmt(struct hn_softc *);
364static void			hn_resume(struct hn_softc *);
365static void			hn_resume_data(struct hn_softc *);
366static void			hn_resume_mgmt(struct hn_softc *);
367static void			hn_suspend_mgmt_taskfunc(void *, int);
368static void			hn_chan_drain(struct hn_softc *,
369				    struct vmbus_channel *);
370static void			hn_disable_rx(struct hn_softc *);
371static void			hn_drain_rxtx(struct hn_softc *, int);
372static void			hn_polling(struct hn_softc *, u_int);
373static void			hn_chan_polling(struct vmbus_channel *, u_int);
374static void			hn_mtu_change_fixup(struct hn_softc *);
375
376static void			hn_update_link_status(struct hn_softc *);
377static void			hn_change_network(struct hn_softc *);
378static void			hn_link_taskfunc(void *, int);
379static void			hn_netchg_init_taskfunc(void *, int);
380static void			hn_netchg_status_taskfunc(void *, int);
381static void			hn_link_status(struct hn_softc *);
382
383static int			hn_create_rx_data(struct hn_softc *, int);
384static void			hn_destroy_rx_data(struct hn_softc *);
385static int			hn_check_iplen(const struct mbuf *, int);
386static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
387static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
388static int			hn_rxfilter_config(struct hn_softc *);
389static int			hn_rss_reconfig(struct hn_softc *);
390static void			hn_rss_ind_fixup(struct hn_softc *);
391static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
392static int			hn_rxpkt(struct hn_rx_ring *, const void *,
393				    int, const struct hn_rxinfo *);
394static uint32_t			hn_rss_type_fromndis(uint32_t);
395static uint32_t			hn_rss_type_tondis(uint32_t);
396
397static int			hn_tx_ring_create(struct hn_softc *, int);
398static void			hn_tx_ring_destroy(struct hn_tx_ring *);
399static int			hn_create_tx_data(struct hn_softc *, int);
400static void			hn_fixup_tx_data(struct hn_softc *);
401static void			hn_fixup_rx_data(struct hn_softc *);
402static void			hn_destroy_tx_data(struct hn_softc *);
403static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
404static void			hn_txdesc_gc(struct hn_tx_ring *,
405				    struct hn_txdesc *);
406static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
407				    struct hn_txdesc *, struct mbuf **);
408static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
409				    struct hn_txdesc *);
410static void			hn_set_chim_size(struct hn_softc *, int);
411static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
412static bool			hn_tx_ring_pending(struct hn_tx_ring *);
413static void			hn_tx_ring_qflush(struct hn_tx_ring *);
414static void			hn_resume_tx(struct hn_softc *, int);
415static void			hn_set_txagg(struct hn_softc *);
416static void			*hn_try_txagg(struct ifnet *,
417				    struct hn_tx_ring *, struct hn_txdesc *,
418				    int);
419static int			hn_get_txswq_depth(const struct hn_tx_ring *);
420static void			hn_txpkt_done(struct hn_nvs_sendctx *,
421				    struct hn_softc *, struct vmbus_channel *,
422				    const void *, int);
423static int			hn_txpkt_sglist(struct hn_tx_ring *,
424				    struct hn_txdesc *);
425static int			hn_txpkt_chim(struct hn_tx_ring *,
426				    struct hn_txdesc *);
427static int			hn_xmit(struct hn_tx_ring *, int);
428static void			hn_xmit_taskfunc(void *, int);
429static void			hn_xmit_txeof(struct hn_tx_ring *);
430static void			hn_xmit_txeof_taskfunc(void *, int);
431#ifdef HN_IFSTART_SUPPORT
432static int			hn_start_locked(struct hn_tx_ring *, int);
433static void			hn_start_taskfunc(void *, int);
434static void			hn_start_txeof(struct hn_tx_ring *);
435static void			hn_start_txeof_taskfunc(void *, int);
436#endif
437
438SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
439    "Hyper-V network interface");
440
441/* Trust tcp segment verification on host side. */
442static int			hn_trust_hosttcp = 1;
443SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
444    &hn_trust_hosttcp, 0,
445    "Trust tcp segment verification on host side, "
446    "when csum info is missing (global setting)");
447
448/* Trust udp datagram verification on host side. */
449static int			hn_trust_hostudp = 1;
450SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
451    &hn_trust_hostudp, 0,
452    "Trust udp datagram verification on host side, "
453    "when csum info is missing (global setting)");
454
455/* Trust ip packet verification on host side. */
456static int			hn_trust_hostip = 1;
457SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
458    &hn_trust_hostip, 0,
459    "Trust ip packet verification on host side, "
460    "when csum info is missing (global setting)");
461
462/*
463 * Offload UDP/IPv4 checksum.
464 */
465static int			hn_enable_udp4cs = 1;
466SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
467    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
468
469/*
470 * Offload UDP/IPv6 checksum.
471 */
472static int			hn_enable_udp6cs = 1;
473SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
474    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
475
476/* Stats. */
477static counter_u64_t		hn_udpcs_fixup;
478SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
479    &hn_udpcs_fixup, "# of UDP checksum fixup");
480
481/*
482 * See hn_set_hlen().
483 *
484 * This value is for Azure.  For Hyper-V, set this above
485 * 65536 to disable UDP datagram checksum fixup.
486 */
487static int			hn_udpcs_fixup_mtu = 1420;
488SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
489    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
490
491/* Limit TSO burst size */
492static int			hn_tso_maxlen = IP_MAXPACKET;
493SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
494    &hn_tso_maxlen, 0, "TSO burst limit");
495
496/* Limit chimney send size */
497static int			hn_tx_chimney_size = 0;
498SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
499    &hn_tx_chimney_size, 0, "Chimney send packet size limit");
500
501/* Limit the size of packet for direct transmission */
502static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
503SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
504    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
505
506/* # of LRO entries per RX ring */
507#if defined(INET) || defined(INET6)
508#if __FreeBSD_version >= 1100095
509static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
510SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
511    &hn_lro_entry_count, 0, "LRO entry count");
512#endif
513#endif
514
515static int			hn_tx_taskq_cnt = 1;
516SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
517    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
518
519#define HN_TX_TASKQ_M_INDEP	0
520#define HN_TX_TASKQ_M_GLOBAL	1
521#define HN_TX_TASKQ_M_EVTTQ	2
522
523static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
524SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
525    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
526    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
527
528#ifndef HN_USE_TXDESC_BUFRING
529static int			hn_use_txdesc_bufring = 0;
530#else
531static int			hn_use_txdesc_bufring = 1;
532#endif
533SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
534    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
535
536#ifdef HN_IFSTART_SUPPORT
537/* Use ifnet.if_start instead of ifnet.if_transmit */
538static int			hn_use_if_start = 0;
539SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
540    &hn_use_if_start, 0, "Use if_start TX method");
541#endif
542
543/* # of channels to use */
544static int			hn_chan_cnt = 0;
545SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
546    &hn_chan_cnt, 0,
547    "# of channels to use; each channel has one RX ring and one TX ring");
548
549/* # of transmit rings to use */
550static int			hn_tx_ring_cnt = 0;
551SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
552    &hn_tx_ring_cnt, 0, "# of TX rings to use");
553
554/* Software TX ring depth */
555static int			hn_tx_swq_depth = 0;
556SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
557    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
558
559/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
560#if __FreeBSD_version >= 1100095
561static u_int			hn_lro_mbufq_depth = 0;
562SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
563    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
564#endif
565
566/* Packet transmission aggregation size limit */
567static int			hn_tx_agg_size = -1;
568SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
569    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
570
571/* Packet transmission aggregation count limit */
572static int			hn_tx_agg_pkts = -1;
573SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
574    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
575
576/* VF list */
577SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
578    0, 0, hn_vflist_sysctl, "A", "VF list");
579
580/* VF mapping */
581SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
582    0, 0, hn_vfmap_sysctl, "A", "VF mapping");
583
584/* Transparent VF */
585static int			hn_xpnt_vf = 1;
586SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
587    &hn_xpnt_vf, 0, "Transparent VF mode");
588
589/* Accurate BPF support for Transparent VF */
590static int			hn_xpnt_vf_accbpf = 0;
591SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
592    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
593
594/* Extra wait for the transparent VF attach routine; unit: seconds. */
595static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
596SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
597    &hn_xpnt_vf_attwait, 0,
598    "Extra wait for transparent VF attach routine; unit: seconds");
599
600static u_int			hn_cpu_index;	/* next CPU for channel */
601static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
602
603static struct rmlock		hn_vfmap_lock;
604static int			hn_vfmap_size;
605static struct ifnet		**hn_vfmap;
606
607static const uint8_t
608hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
609	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
610	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
611	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
612	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
613	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
614};
615
616static const struct hyperv_guid	hn_guid = {
617	.hv_guid = {
618	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
619	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
620};
621
622static device_method_t hn_methods[] = {
623	/* Device interface */
624	DEVMETHOD(device_probe,		hn_probe),
625	DEVMETHOD(device_attach,	hn_attach),
626	DEVMETHOD(device_detach,	hn_detach),
627	DEVMETHOD(device_shutdown,	hn_shutdown),
628	DEVMETHOD_END
629};
630
631static driver_t hn_driver = {
632	"hn",
633	hn_methods,
634	sizeof(struct hn_softc)
635};
636
637static devclass_t hn_devclass;
638
639DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
640MODULE_VERSION(hn, 1);
641MODULE_DEPEND(hn, vmbus, 1, 1, 1);
642
643#if __FreeBSD_version >= 1100099
644static void
645hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
646{
647	int i;
648
649	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
650		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
651}
652#endif
653
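/*
 * Send an RNDIS data message using the GPA (scatter/gather) list set
 * up in the TX ring; used when the packet is not copied into a
 * chimney sending buffer.
 */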
654static int
655hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
656{
657
658	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
659	    txd->chim_size == 0, ("invalid rndis sglist txd"));
660	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
661	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
662}
663
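/*
 * Send an RNDIS data message that has been copied into a chimney
 * sending buffer; only the chimney index and size are handed to the
 * host.
 */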
664static int
665hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
666{
667	struct hn_nvs_rndis rndis;
668
669	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
670	    txd->chim_size > 0, ("invalid rndis chim txd"));
671
672	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
673	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
674	rndis.nvs_chim_idx = txd->chim_index;
675	rndis.nvs_chim_sz = txd->chim_size;
676
677	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
678	    &rndis, sizeof(rndis), &txd->send_ctx));
679}
680
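/*
 * Allocate a chimney sending buffer slot.  The bitmap is scanned
 * lock-free with atomic test-and-set; HN_NVS_CHIM_IDX_INVALID is
 * returned when no slot is available.
 */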
681static __inline uint32_t
682hn_chim_alloc(struct hn_softc *sc)
683{
684	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
685	u_long *bmap = sc->hn_chim_bmap;
686	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
687
688	for (i = 0; i < bmap_cnt; ++i) {
689		int idx;
690
691		idx = ffsl(~bmap[i]);
692		if (idx == 0)
693			continue;
694
695		--idx; /* ffsl is 1-based */
696		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
697		    ("invalid i %d and idx %d", i, idx));
698
699		if (atomic_testandset_long(&bmap[i], idx))
700			continue;
701
702		ret = i * LONG_BIT + idx;
703		break;
704	}
705	return (ret);
706}
707
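/* Return a chimney sending buffer slot to the bitmap. */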
708static __inline void
709hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
710{
711	u_long mask;
712	uint32_t idx;
713
714	idx = chim_idx / LONG_BIT;
715	KASSERT(idx < sc->hn_chim_bmap_cnt,
716	    ("invalid chimney index 0x%x", chim_idx));
717
718	mask = 1UL << (chim_idx % LONG_BIT);
719	KASSERT(sc->hn_chim_bmap[idx] & mask,
720	    ("index bitmap 0x%lx, chimney index %u, "
721	     "bitmap idx %d, bitmask 0x%lx",
722	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
723
724	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
725}
726
727#if defined(INET6) || defined(INET)
728
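/*
 * Make sure at least 'len' bytes are contiguous in the first mbuf.
 * If m_pullup() fails, the mbuf chain is freed and the enclosing
 * function returns NULL.
 */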
729#define PULLUP_HDR(m, len)				\
730do {							\
731	if (__predict_false((m)->m_len < (len))) {	\
732		(m) = m_pullup((m), (len));		\
733		if ((m) == NULL)			\
734			return (NULL);			\
735	}						\
736} while (0)
737
738/*
739 * NOTE: If this function fails, m_head will be freed.
740 */
741static __inline struct mbuf *
742hn_tso_fixup(struct mbuf *m_head)
743{
744	struct ether_vlan_header *evl;
745	struct tcphdr *th;
746	int ehlen;
747
748	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
749
750	PULLUP_HDR(m_head, sizeof(*evl));
751	evl = mtod(m_head, struct ether_vlan_header *);
752	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
753		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
754	else
755		ehlen = ETHER_HDR_LEN;
756	m_head->m_pkthdr.l2hlen = ehlen;
757
758#ifdef INET
759	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
760		struct ip *ip;
761		int iphlen;
762
763		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
764		ip = mtodo(m_head, ehlen);
765		iphlen = ip->ip_hl << 2;
766		m_head->m_pkthdr.l3hlen = iphlen;
767
768		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
769		th = mtodo(m_head, ehlen + iphlen);
770
771		ip->ip_len = 0;
772		ip->ip_sum = 0;
773		th->th_sum = in_pseudo(ip->ip_src.s_addr,
774		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
775	}
776#endif
777#if defined(INET6) && defined(INET)
778	else
779#endif
780#ifdef INET6
781	{
782		struct ip6_hdr *ip6;
783
784		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
785		ip6 = mtodo(m_head, ehlen);
786		if (ip6->ip6_nxt != IPPROTO_TCP) {
787			m_freem(m_head);
788			return (NULL);
789		}
790		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
791
792		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
793		th = mtodo(m_head, ehlen + sizeof(*ip6));
794
795		ip6->ip6_plen = 0;
796		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
797	}
798#endif
799	return (m_head);
800}
801
802/*
803 * NOTE: If this function fails, m_head will be freed.
804 */
805static __inline struct mbuf *
806hn_set_hlen(struct mbuf *m_head)
807{
808	const struct ether_vlan_header *evl;
809	int ehlen;
810
811	PULLUP_HDR(m_head, sizeof(*evl));
812	evl = mtod(m_head, const struct ether_vlan_header *);
813	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
814		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
815	else
816		ehlen = ETHER_HDR_LEN;
817	m_head->m_pkthdr.l2hlen = ehlen;
818
819#ifdef INET
820	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
821		const struct ip *ip;
822		int iphlen;
823
824		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
825		ip = mtodo(m_head, ehlen);
826		iphlen = ip->ip_hl << 2;
827		m_head->m_pkthdr.l3hlen = iphlen;
828
829		/*
830		 * UDP checksum offload does not work in Azure if the
831		 * following conditions are met:
832		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
833		 * - IP_DF is not set in the IP hdr.
834		 *
835		 * Fall back to software checksum for these UDP datagrams.
836		 */
837		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
838		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
839		    (ntohs(ip->ip_off) & IP_DF) == 0) {
840			uint16_t off = ehlen + iphlen;
841
842			counter_u64_add(hn_udpcs_fixup, 1);
843			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
844		*(uint16_t *)(m_head->m_data + off +
845		    m_head->m_pkthdr.csum_data) = in_cksum_skip(
846		    m_head, m_head->m_pkthdr.len, off);
847			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
848		}
849	}
850#endif
851#if defined(INET6) && defined(INET)
852	else
853#endif
854#ifdef INET6
855	{
856		const struct ip6_hdr *ip6;
857
858		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
859		ip6 = mtodo(m_head, ehlen);
860		if (ip6->ip6_nxt != IPPROTO_TCP &&
861		    ip6->ip6_nxt != IPPROTO_UDP) {
862			m_freem(m_head);
863			return (NULL);
864		}
865		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
866	}
867#endif
868	return (m_head);
869}
870
871/*
872 * NOTE: If this function fails, m_head will be freed.
873 */
874static __inline struct mbuf *
875hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
876{
877	const struct tcphdr *th;
878	int ehlen, iphlen;
879
880	*tcpsyn = 0;
881	ehlen = m_head->m_pkthdr.l2hlen;
882	iphlen = m_head->m_pkthdr.l3hlen;
883
884	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
885	th = mtodo(m_head, ehlen + iphlen);
886	if (th->th_flags & TH_SYN)
887		*tcpsyn = 1;
888	return (m_head);
889}
890
891#undef PULLUP_HDR
892
893#endif	/* INET6 || INET */
894
895static int
896hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
897{
898	int error = 0;
899
900	HN_LOCK_ASSERT(sc);
901
902	if (sc->hn_rx_filter != filter) {
903		error = hn_rndis_set_rxfilter(sc, filter);
904		if (!error)
905			sc->hn_rx_filter = filter;
906	}
907	return (error);
908}
909
910static int
911hn_rxfilter_config(struct hn_softc *sc)
912{
913	struct ifnet *ifp = sc->hn_ifp;
914	uint32_t filter;
915
916	HN_LOCK_ASSERT(sc);
917
918	/*
919	 * If the non-transparent mode VF is activated, we don't know how
920	 * its RX filter is configured, so put the synthetic device in
921	 * promiscuous mode.
922	 */
923	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
924		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
925	} else {
926		filter = NDIS_PACKET_TYPE_DIRECTED;
927		if (ifp->if_flags & IFF_BROADCAST)
928			filter |= NDIS_PACKET_TYPE_BROADCAST;
929		/* TODO: support multicast list */
930		if ((ifp->if_flags & IFF_ALLMULTI) ||
931		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
932			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
933	}
934	return (hn_set_rxfilter(sc, filter));
935}
936
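/*
 * Propagate the transmit aggregation limits (size, packet count and
 * alignment) to all TX rings, clamped by the host's RNDIS limits and
 * the chimney buffer size; aggregation is disabled if the resulting
 * limits are too small to be useful.
 */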
937static void
938hn_set_txagg(struct hn_softc *sc)
939{
940	uint32_t size, pkts;
941	int i;
942
943	/*
944	 * Setup aggregation size.
945	 */
946	if (sc->hn_agg_size < 0)
947		size = UINT32_MAX;
948	else
949		size = sc->hn_agg_size;
950
951	if (sc->hn_rndis_agg_size < size)
952		size = sc->hn_rndis_agg_size;
953
954	/* NOTE: We only aggregate packets using chimney sending buffers. */
955	if (size > (uint32_t)sc->hn_chim_szmax)
956		size = sc->hn_chim_szmax;
957
958	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
959		/* Disable */
960		size = 0;
961		pkts = 0;
962		goto done;
963	}
964
965	/* NOTE: Type of the per TX ring setting is 'int'. */
966	if (size > INT_MAX)
967		size = INT_MAX;
968
969	/*
970	 * Setup aggregation packet count.
971	 */
972	if (sc->hn_agg_pkts < 0)
973		pkts = UINT32_MAX;
974	else
975		pkts = sc->hn_agg_pkts;
976
977	if (sc->hn_rndis_agg_pkts < pkts)
978		pkts = sc->hn_rndis_agg_pkts;
979
980	if (pkts <= 1) {
981		/* Disable */
982		size = 0;
983		pkts = 0;
984		goto done;
985	}
986
987	/* NOTE: Type of the per TX ring setting is 'short'. */
988	if (pkts > SHRT_MAX)
989		pkts = SHRT_MAX;
990
991done:
992	/* NOTE: Type of the per TX ring setting is 'short'. */
993	if (sc->hn_rndis_agg_align > SHRT_MAX) {
994		/* Disable */
995		size = 0;
996		pkts = 0;
997	}
998
999	if (bootverbose) {
1000		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1001		    size, pkts, sc->hn_rndis_agg_align);
1002	}
1003
1004	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1005		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1006
1007		mtx_lock(&txr->hn_tx_lock);
1008		txr->hn_agg_szmax = size;
1009		txr->hn_agg_pktmax = pkts;
1010		txr->hn_agg_align = sc->hn_rndis_agg_align;
1011		mtx_unlock(&txr->hn_tx_lock);
1012	}
1013}
1014
1015static int
1016hn_get_txswq_depth(const struct hn_tx_ring *txr)
1017{
1018
1019	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1020	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1021		return txr->hn_txdesc_cnt;
1022	return hn_tx_swq_depth;
1023}
1024
1025static int
1026hn_rss_reconfig(struct hn_softc *sc)
1027{
1028	int error;
1029
1030	HN_LOCK_ASSERT(sc);
1031
1032	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1033		return (ENXIO);
1034
1035	/*
1036	 * Disable RSS first.
1037	 *
1038	 * NOTE:
1039	 * Direct reconfiguration by setting the UNCHG flags does
1040	 * _not_ work properly.
1041	 */
1042	if (bootverbose)
1043		if_printf(sc->hn_ifp, "disable RSS\n");
1044	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1045	if (error) {
1046		if_printf(sc->hn_ifp, "RSS disable failed\n");
1047		return (error);
1048	}
1049
1050	/*
1051	 * Reenable the RSS w/ the updated RSS key or indirect
1052	 * table.
1053	 */
1054	if (bootverbose)
1055		if_printf(sc->hn_ifp, "reconfig RSS\n");
1056	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1057	if (error) {
1058		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1059		return (error);
1060	}
1061	return (0);
1062}
1063
1064static void
1065hn_rss_ind_fixup(struct hn_softc *sc)
1066{
1067	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1068	int i, nchan;
1069
1070	nchan = sc->hn_rx_ring_inuse;
1071	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1072
1073	/*
1074	 * Check indirect table to make sure that all channels in it
1075	 * can be used.
1076	 */
1077	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1078		if (rss->rss_ind[i] >= nchan) {
1079			if_printf(sc->hn_ifp,
1080			    "RSS indirect table %d fixup: %u -> %d\n",
1081			    i, rss->rss_ind[i], nchan - 1);
1082			rss->rss_ind[i] = nchan - 1;
1083		}
1084	}
1085}
1086
1087static int
1088hn_ifmedia_upd(struct ifnet *ifp __unused)
1089{
1090
1091	return EOPNOTSUPP;
1092}
1093
1094static void
1095hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1096{
1097	struct hn_softc *sc = ifp->if_softc;
1098
1099	ifmr->ifm_status = IFM_AVALID;
1100	ifmr->ifm_active = IFM_ETHER;
1101
1102	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1103		ifmr->ifm_active |= IFM_NONE;
1104		return;
1105	}
1106	ifmr->ifm_status |= IFM_ACTIVE;
1107	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1108}
1109
1110static void
1111hn_rxvf_set_task(void *xarg, int pending __unused)
1112{
1113	struct hn_rxvf_setarg *arg = xarg;
1114
1115	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1116}
1117
1118static void
1119hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1120{
1121	struct hn_rx_ring *rxr;
1122	struct hn_rxvf_setarg arg;
1123	struct task task;
1124	int i;
1125
1126	HN_LOCK_ASSERT(sc);
1127
1128	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1129
1130	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1131		rxr = &sc->hn_rx_ring[i];
1132
1133		if (i < sc->hn_rx_ring_inuse) {
1134			arg.rxr = rxr;
1135			arg.vf_ifp = vf_ifp;
1136			vmbus_chan_run_task(rxr->hn_chan, &task);
1137		} else {
1138			rxr->hn_rxvf_ifp = vf_ifp;
1139		}
1140	}
1141}
1142
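/*
 * A VF candidate must be an Ethernet interface other than the
 * synthetic interface itself, must not be a vlan(4) or lagg(4)
 * pseudo interface, and must carry the same link-level address.
 */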
1143static bool
1144hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1145{
1146	const struct ifnet *hn_ifp;
1147
1148	hn_ifp = sc->hn_ifp;
1149
1150	if (ifp == hn_ifp)
1151		return (false);
1152
1153	if (ifp->if_alloctype != IFT_ETHER)
1154		return (false);
1155
1156	/* Ignore lagg/vlan interfaces */
1157	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1158	    strcmp(ifp->if_dname, "vlan") == 0)
1159		return (false);
1160
1161	/*
1162	 * During detach events ifp->if_addr might be NULL.
1163	 * Make sure the bcmp() below doesn't panic on that.
1164	 */
1165	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1166		return (false);
1167
1168	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1169		return (false);
1170
1171	return (true);
1172}
1173
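/*
 * Switch the data path between the synthetic device and the
 * non-transparent mode VF, adjusting the RX filter, RSS settings
 * and link state accordingly.
 */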
1174static void
1175hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1176{
1177	struct ifnet *hn_ifp;
1178
1179	HN_LOCK(sc);
1180
1181	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1182		goto out;
1183
1184	if (!hn_ismyvf(sc, ifp))
1185		goto out;
1186	hn_ifp = sc->hn_ifp;
1187
1188	if (rxvf) {
1189		if (sc->hn_flags & HN_FLAG_RXVF)
1190			goto out;
1191
1192		sc->hn_flags |= HN_FLAG_RXVF;
1193		hn_rxfilter_config(sc);
1194	} else {
1195		if (!(sc->hn_flags & HN_FLAG_RXVF))
1196			goto out;
1197
1198		sc->hn_flags &= ~HN_FLAG_RXVF;
1199		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1200			hn_rxfilter_config(sc);
1201		else
1202			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1203	}
1204
1205	hn_nvs_set_datapath(sc,
1206	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1207
1208	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1209
1210	if (rxvf) {
1211		hn_vf_rss_fixup(sc, true);
1212		hn_suspend_mgmt(sc);
1213		sc->hn_link_flags &=
1214		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1215		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1216	} else {
1217		hn_vf_rss_restore(sc);
1218		hn_resume_mgmt(sc);
1219	}
1220
1221	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1222	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1223
1224	if (bootverbose) {
1225		if_printf(hn_ifp, "datapath is switched %s %s\n",
1226		    rxvf ? "to" : "from", ifp->if_xname);
1227	}
1228out:
1229	HN_UNLOCK(sc);
1230}
1231
1232static void
1233hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1234{
1235
1236	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1237		return;
1238	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1239}
1240
1241static void
1242hn_ifaddr_event(void *arg, struct ifnet *ifp)
1243{
1244
1245	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1246}
1247
1248static int
1249hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1250{
1251	struct ifnet *ifp, *vf_ifp;
1252	uint64_t tmp;
1253	int error;
1254
1255	HN_LOCK_ASSERT(sc);
1256	ifp = sc->hn_ifp;
1257	vf_ifp = sc->hn_vf_ifp;
1258
1259	/*
1260	 * Fix up requested capabilities w/ supported capabilities,
1261	 * since the supported capabilities could have been changed.
1262	 */
1263	ifr->ifr_reqcap &= ifp->if_capabilities;
1264	/* Pass SIOCSIFCAP to VF. */
1265	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1266
1267	/*
1268	 * NOTE:
1269	 * The error will be propagated to the callers; however, it
1270	 * is _not_ useful here.
1271	 */
1272
1273	/*
1274	 * Merge VF's enabled capabilities.
1275	 */
1276	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1277
1278	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1279	if (ifp->if_capenable & IFCAP_TXCSUM)
1280		ifp->if_hwassist |= tmp;
1281	else
1282		ifp->if_hwassist &= ~tmp;
1283
1284	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1285	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1286		ifp->if_hwassist |= tmp;
1287	else
1288		ifp->if_hwassist &= ~tmp;
1289
1290	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1291	if (ifp->if_capenable & IFCAP_TSO4)
1292		ifp->if_hwassist |= tmp;
1293	else
1294		ifp->if_hwassist &= ~tmp;
1295
1296	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1297	if (ifp->if_capenable & IFCAP_TSO6)
1298		ifp->if_hwassist |= tmp;
1299	else
1300		ifp->if_hwassist &= ~tmp;
1301
1302	return (error);
1303}
1304
1305static int
1306hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1307{
1308	struct ifnet *vf_ifp;
1309	struct ifreq ifr;
1310
1311	HN_LOCK_ASSERT(sc);
1312	vf_ifp = sc->hn_vf_ifp;
1313
1314	memset(&ifr, 0, sizeof(ifr));
1315	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1316	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1317	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1318	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1319}
1320
1321static void
1322hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1323{
1324	struct ifnet *ifp = sc->hn_ifp;
1325	int allmulti = 0;
1326
1327	HN_LOCK_ASSERT(sc);
1328
1329	/* XXX vlan(4) style mcast addr maintenance */
1330	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1331		allmulti = IFF_ALLMULTI;
1332
1333	/* Always set the VF's if_flags */
1334	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1335}
1336
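/*
 * if_input hook installed on the transparent mode VF: tap BPF and
 * update counters on the VF, then hand the packets to hn(4)'s
 * if_input so the traffic appears on the synthetic interface.
 */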
1337static void
1338hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1339{
1340	struct rm_priotracker pt;
1341	struct ifnet *hn_ifp = NULL;
1342	struct mbuf *mn;
1343
1344	/*
1345	 * XXX racy, if hn(4) is ever detached.
1346	 */
1347	rm_rlock(&hn_vfmap_lock, &pt);
1348	if (vf_ifp->if_index < hn_vfmap_size)
1349		hn_ifp = hn_vfmap[vf_ifp->if_index];
1350	rm_runlock(&hn_vfmap_lock, &pt);
1351
1352	if (hn_ifp != NULL) {
1353		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1354			/*
1355			 * Allow tapping on the VF.
1356			 */
1357			ETHER_BPF_MTAP(vf_ifp, mn);
1358
1359			/*
1360			 * Update VF stats.
1361			 */
1362			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1363				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1364				    mn->m_pkthdr.len);
1365			}
1366			/*
1367			 * XXX IFCOUNTER_IMCAST
1368			 * This stat updating is kinda invasive, since it
1369			 * requires two checks on the mbuf: the length check
1370			 * and the ethernet header check.  As of this writing,
1371			 * all multicast packets go directly to hn(4), which
1372			 * makes imcast stat updating in the VF pointless.
1373			 */
1374
1375			/*
1376			 * Fix up rcvif and increase hn(4)'s ipackets.
1377			 */
1378			mn->m_pkthdr.rcvif = hn_ifp;
1379			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1380		}
1381		/*
1382		 * Go through hn(4)'s if_input.
1383		 */
1384		hn_ifp->if_input(hn_ifp, m);
1385	} else {
1386		/*
1387		 * In the middle of the transition; free this
1388		 * mbuf chain.
1389		 */
1390		while (m != NULL) {
1391			mn = m->m_nextpkt;
1392			m->m_nextpkt = NULL;
1393			m_freem(m);
1394			m = mn;
1395		}
1396	}
1397}
1398
1399static void
1400hn_mtu_change_fixup(struct hn_softc *sc)
1401{
1402	struct ifnet *ifp;
1403
1404	HN_LOCK_ASSERT(sc);
1405	ifp = sc->hn_ifp;
1406
1407	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1408#if __FreeBSD_version >= 1100099
1409	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1410		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1411#endif
1412}
1413
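/*
 * Translate NDIS hash type bits to RSS_TYPE_ flags, and vice versa
 * below.
 */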
1414static uint32_t
1415hn_rss_type_fromndis(uint32_t rss_hash)
1416{
1417	uint32_t types = 0;
1418
1419	if (rss_hash & NDIS_HASH_IPV4)
1420		types |= RSS_TYPE_IPV4;
1421	if (rss_hash & NDIS_HASH_TCP_IPV4)
1422		types |= RSS_TYPE_TCP_IPV4;
1423	if (rss_hash & NDIS_HASH_IPV6)
1424		types |= RSS_TYPE_IPV6;
1425	if (rss_hash & NDIS_HASH_IPV6_EX)
1426		types |= RSS_TYPE_IPV6_EX;
1427	if (rss_hash & NDIS_HASH_TCP_IPV6)
1428		types |= RSS_TYPE_TCP_IPV6;
1429	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1430		types |= RSS_TYPE_TCP_IPV6_EX;
1431	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1432		types |= RSS_TYPE_UDP_IPV4;
1433	return (types);
1434}
1435
1436static uint32_t
1437hn_rss_type_tondis(uint32_t types)
1438{
1439	uint32_t rss_hash = 0;
1440
1441	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1442	    ("UDP6 and UDP6EX are not supported"));
1443
1444	if (types & RSS_TYPE_IPV4)
1445		rss_hash |= NDIS_HASH_IPV4;
1446	if (types & RSS_TYPE_TCP_IPV4)
1447		rss_hash |= NDIS_HASH_TCP_IPV4;
1448	if (types & RSS_TYPE_IPV6)
1449		rss_hash |= NDIS_HASH_IPV6;
1450	if (types & RSS_TYPE_IPV6_EX)
1451		rss_hash |= NDIS_HASH_IPV6_EX;
1452	if (types & RSS_TYPE_TCP_IPV6)
1453		rss_hash |= NDIS_HASH_TCP_IPV6;
1454	if (types & RSS_TYPE_TCP_IPV6_EX)
1455		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1456	if (types & RSS_TYPE_UDP_IPV4)
1457		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1458	return (rss_hash);
1459}
1460
1461static void
1462hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1463{
1464	int i;
1465
1466	HN_LOCK_ASSERT(sc);
1467
1468	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1469		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1470}
1471
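/*
 * Adopt the VF's Toeplitz RSS key and restrict the synthetic parts'
 * hash types to those supported by both sides, so that the two data
 * paths hash packets consistently; conflicting hash types are only
 * masked from the mbuf hash delivery, not disabled.
 */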
1472static void
1473hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1474{
1475	struct ifnet *ifp, *vf_ifp;
1476	struct ifrsshash ifrh;
1477	struct ifrsskey ifrk;
1478	int error;
1479	uint32_t my_types, diff_types, mbuf_types = 0;
1480
1481	HN_LOCK_ASSERT(sc);
1482	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1483	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1484
1485	if (sc->hn_rx_ring_inuse == 1) {
1486		/* No RSS on synthetic parts; done. */
1487		return;
1488	}
1489	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1490		/* Synthetic parts do not support Toeplitz; done. */
1491		return;
1492	}
1493
1494	ifp = sc->hn_ifp;
1495	vf_ifp = sc->hn_vf_ifp;
1496
1497	/*
1498	 * Extract the VF's RSS key.  Only a 40-byte Toeplitz key is
1499	 * supported.
1500	 */
1501	memset(&ifrk, 0, sizeof(ifrk));
1502	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1503	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1504	if (error) {
1505		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1506		    vf_ifp->if_xname, error);
1507		goto done;
1508	}
1509	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1510		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1511		    vf_ifp->if_xname, ifrk.ifrk_func);
1512		goto done;
1513	}
1514	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1515		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1516		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1517		goto done;
1518	}
1519
1520	/*
1521	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1522	 */
1523	memset(&ifrh, 0, sizeof(ifrh));
1524	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1525	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1526	if (error) {
1527		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
1528		    vf_ifp->if_xname, error);
1529		goto done;
1530	}
1531	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1532		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1533		    vf_ifp->if_xname, ifrh.ifrh_func);
1534		goto done;
1535	}
1536
1537	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1538	if ((ifrh.ifrh_types & my_types) == 0) {
1539		/* This would disable RSS; ignore the VF's hash types. */
1540		if_printf(ifp, "%s intersection of RSS types failed.  "
1541		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1542		    ifrh.ifrh_types, my_types);
1543		goto done;
1544	}
1545
1546	diff_types = my_types ^ ifrh.ifrh_types;
1547	my_types &= ifrh.ifrh_types;
1548	mbuf_types = my_types;
1549
1550	/*
1551	 * Detect RSS hash value/type conflicts.
1552	 *
1553	 * NOTE:
1554	 * We don't disable the hash type, but stop delivering the hash
1555	 * value/type through mbufs on the RX path.
1556	 *
1557	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1558	 * hash is delivered with type of TCP_IPV4.  This means if
1559	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1560	 * least to hn_mbuf_hash.  However, given that _all_ of the
1561	 * NICs implement TCP_IPV4, this will _not_ pose any issues
1562	 * here.
1563	 */
1564	if ((my_types & RSS_TYPE_IPV4) &&
1565	    (diff_types & ifrh.ifrh_types &
1566	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1567		/* Conflict; disable IPV4 hash type/value delivery. */
1568		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1569		mbuf_types &= ~RSS_TYPE_IPV4;
1570	}
1571	if ((my_types & RSS_TYPE_IPV6) &&
1572	    (diff_types & ifrh.ifrh_types &
1573	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1574	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1575	      RSS_TYPE_IPV6_EX))) {
1576		/* Conflict; disable IPV6 hash type/value delivery. */
1577		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1578		mbuf_types &= ~RSS_TYPE_IPV6;
1579	}
1580	if ((my_types & RSS_TYPE_IPV6_EX) &&
1581	    (diff_types & ifrh.ifrh_types &
1582	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1583	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1584	      RSS_TYPE_IPV6))) {
1585		/* Conflict; disable IPV6_EX hash type/value delivery. */
1586		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1587		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1588	}
1589	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1590	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1591		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1592		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1593		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1594	}
1595	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1596	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1597		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1598		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1599		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1600	}
1601	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1602	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1603		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1604		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1605		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1606	}
1607	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1608	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1609		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1610		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1611		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1612	}
1613
1614	/*
1615	 * Indirect table does not matter.
1616	 */
1617
1618	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1619	    hn_rss_type_tondis(my_types);
1620	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1621	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1622
1623	if (reconf) {
1624		error = hn_rss_reconfig(sc);
1625		if (error) {
1626			/* XXX roll-back? */
1627			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1628			/* XXX keep going. */
1629		}
1630	}
1631done:
1632	/* Hash deliverability for mbufs. */
1633	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1634}
1635
1636static void
1637hn_vf_rss_restore(struct hn_softc *sc)
1638{
1639
1640	HN_LOCK_ASSERT(sc);
1641	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1642	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1643
1644	if (sc->hn_rx_ring_inuse == 1)
1645		goto done;
1646
1647	/*
1648	 * Restore hash types.  Key does _not_ matter.
1649	 */
1650	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1651		int error;
1652
1653		sc->hn_rss_hash = sc->hn_rss_hcap;
1654		error = hn_rss_reconfig(sc);
1655		if (error) {
1656			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1657			    error);
1658			/* XXX keep going. */
1659		}
1660	}
1661done:
1662	/* Hash deliverability for mbufs. */
1663	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1664}
1665
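/*
 * Mark the transparent mode VF ready: save the synthetic interface's
 * capabilities and TSO limits for later restoration, intersect them
 * with the VF's, and push the enabled capabilities and MTU down to
 * the VF.
 */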
1666static void
1667hn_xpnt_vf_setready(struct hn_softc *sc)
1668{
1669	struct ifnet *ifp, *vf_ifp;
1670	struct ifreq ifr;
1671
1672	HN_LOCK_ASSERT(sc);
1673	ifp = sc->hn_ifp;
1674	vf_ifp = sc->hn_vf_ifp;
1675
1676	/*
1677	 * Mark the VF ready.
1678	 */
1679	sc->hn_vf_rdytick = 0;
1680
1681	/*
1682	 * Save information for restoration.
1683	 */
1684	sc->hn_saved_caps = ifp->if_capabilities;
1685	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1686	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1687	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1688
1689	/*
1690	 * Intersect supported/enabled capabilities.
1691	 *
1692	 * NOTE:
1693	 * if_hwassist is not changed here.
1694	 */
1695	ifp->if_capabilities &= vf_ifp->if_capabilities;
1696	ifp->if_capenable &= ifp->if_capabilities;
1697
1698	/*
1699	 * Fix TSO settings.
1700	 */
1701	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1702		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1703	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1704		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1705	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1706		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1707
1708	/*
1709	 * Change VF's enabled capabilities.
1710	 */
1711	memset(&ifr, 0, sizeof(ifr));
1712	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1713	ifr.ifr_reqcap = ifp->if_capenable;
1714	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1715
1716	if (ifp->if_mtu != ETHERMTU) {
1717		int error;
1718
1719		/*
1720		 * Change VF's MTU.
1721		 */
1722		memset(&ifr, 0, sizeof(ifr));
1723		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1724		ifr.ifr_mtu = ifp->if_mtu;
1725		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1726		if (error) {
1727			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
1728			    vf_ifp->if_xname, ifp->if_mtu);
1729			if (ifp->if_mtu > ETHERMTU) {
1730				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1731
1732				/*
1733				 * XXX
1734				 * No need to adjust the synthetic parts' MTU;
1735				 * failure of the adjustment will cause us
1736				 * infinite headache.
1737				 */
1738				ifp->if_mtu = ETHERMTU;
1739				hn_mtu_change_fixup(sc);
1740			}
1741		}
1742	}
1743}
1744
1745static bool
1746hn_xpnt_vf_isready(struct hn_softc *sc)
1747{
1748
1749	HN_LOCK_ASSERT(sc);
1750
1751	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1752		return (false);
1753
1754	if (sc->hn_vf_rdytick == 0)
1755		return (true);
1756
1757	if (sc->hn_vf_rdytick > ticks)
1758		return (false);
1759
1760	/* Mark VF as ready. */
1761	hn_xpnt_vf_setready(sc);
1762	return (true);
1763}
1764
1765static void
1766hn_xpnt_vf_setenable(struct hn_softc *sc)
1767{
1768	int i;
1769
1770	HN_LOCK_ASSERT(sc);
1771
1772	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1773	rm_wlock(&sc->hn_vf_lock);
1774	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1775	rm_wunlock(&sc->hn_vf_lock);
1776
1777	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1778		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1779}
1780
1781static void
1782hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1783{
1784	int i;
1785
1786	HN_LOCK_ASSERT(sc);
1787
1788	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1789	rm_wlock(&sc->hn_vf_lock);
1790	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1791	if (clear_vf)
1792		sc->hn_vf_ifp = NULL;
1793	rm_wunlock(&sc->hn_vf_lock);
1794
1795	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1796		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1797}
1798
1799static void
1800hn_xpnt_vf_init(struct hn_softc *sc)
1801{
1802	int error;
1803
1804	HN_LOCK_ASSERT(sc);
1805
1806	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1807	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1808
1809	if (bootverbose) {
1810		if_printf(sc->hn_ifp, "try bringing up %s\n",
1811		    sc->hn_vf_ifp->if_xname);
1812	}
1813
1814	/*
1815	 * Bring the VF up.
1816	 */
1817	hn_xpnt_vf_saveifflags(sc);
1818	sc->hn_vf_ifp->if_flags |= IFF_UP;
1819	error = hn_xpnt_vf_iocsetflags(sc);
1820	if (error) {
1821		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1822		    sc->hn_vf_ifp->if_xname, error);
1823		return;
1824	}
1825
1826	/*
1827	 * NOTE:
1828	 * Datapath setting must happen _after_ bringing the VF up.
1829	 */
1830	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1831
1832	/*
1833	 * NOTE:
1834	 * Fix up RSS related bits _after_ the VF is brought up, since
1835	 * many VFs generate their RSS key during initialization.
1836	 */
1837	hn_vf_rss_fixup(sc, true);
1838
1839	/* Mark transparent mode VF as enabled. */
1840	hn_xpnt_vf_setenable(sc);
1841}
1842
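/*
 * Taskqueue callback for the delayed transparent VF initialization;
 * scheduled by hn_ifnet_attevent() after the VF attach grace period.
 */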
1843static void
1844hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1845{
1846	struct hn_softc *sc = xsc;
1847
1848	HN_LOCK(sc);
1849
1850	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1851		goto done;
1852	if (sc->hn_vf_ifp == NULL)
1853		goto done;
1854	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1855		goto done;
1856
1857	if (sc->hn_vf_rdytick != 0) {
1858		/* Mark VF as ready. */
1859		hn_xpnt_vf_setready(sc);
1860	}
1861
1862	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1863		/*
1864		 * Delayed VF initialization.
1865		 */
1866		if (bootverbose) {
1867			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1868			    sc->hn_vf_ifp->if_xname);
1869		}
1870		hn_xpnt_vf_init(sc);
1871	}
1872done:
1873	HN_UNLOCK(sc);
1874}
1875
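/*
 * ether_ifattach event handler: detect the VF belonging to this hn(4)
 * instance, record it in the global VF map and, in transparent VF mode,
 * redirect its if_input to hn_xpnt_vf_input() and schedule the delayed
 * VF bring-up.
 */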
1876static void
1877hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1878{
1879	struct hn_softc *sc = xsc;
1880
1881	HN_LOCK(sc);
1882
1883	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1884		goto done;
1885
1886	if (!hn_ismyvf(sc, ifp))
1887		goto done;
1888
1889	if (sc->hn_vf_ifp != NULL) {
1890		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1891		    sc->hn_vf_ifp->if_xname);
1892		goto done;
1893	}
1894
1895	if (hn_xpnt_vf && ifp->if_start != NULL) {
1896		/*
1897		 * ifnet.if_start is _not_ supported by transparent
1898		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1899		 */
1900		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1901		    "in transparent VF mode.\n", ifp->if_xname);
1902		goto done;
1903	}
1904
1905	rm_wlock(&hn_vfmap_lock);
1906
1907	if (ifp->if_index >= hn_vfmap_size) {
1908		struct ifnet **newmap;
1909		int newsize;
1910
1911		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1912		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1913		    M_WAITOK | M_ZERO);
1914
1915		memcpy(newmap, hn_vfmap,
1916		    sizeof(struct ifnet *) * hn_vfmap_size);
1917		free(hn_vfmap, M_DEVBUF);
1918		hn_vfmap = newmap;
1919		hn_vfmap_size = newsize;
1920	}
1921	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1922	    ("%s: ifindex %d was mapped to %s",
1923	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1924	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1925
1926	rm_wunlock(&hn_vfmap_lock);
1927
1928	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1929	rm_wlock(&sc->hn_vf_lock);
1930	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1931	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1932	sc->hn_vf_ifp = ifp;
1933	rm_wunlock(&sc->hn_vf_lock);
1934
1935	if (hn_xpnt_vf) {
1936		int wait_ticks;
1937
1938		/*
1939		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1940		 * Save vf_ifp's current if_input for later restoration.
1941		 */
1942		sc->hn_vf_input = ifp->if_input;
1943		ifp->if_input = hn_xpnt_vf_input;
1944
1945		/*
1946		 * Stop link status management; use the VF's.
1947		 */
1948		hn_suspend_mgmt(sc);
1949
1950		/*
1951		 * Give the VF some time to complete its attach routine.
1952		 */
1953		wait_ticks = hn_xpnt_vf_attwait * hz;
1954		sc->hn_vf_rdytick = ticks + wait_ticks;
1955
1956		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1957		    wait_ticks);
1958	}
1959done:
1960	HN_UNLOCK(sc);
1961}
1962
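/*
 * ifnet departure event handler: undo hn_ifnet_attevent() when the VF
 * detaches; restore if_input, switch the datapath back to the synthetic
 * device, restore the saved capabilities/TSO/RSS settings, and drop the
 * VF from the global VF map.
 */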
1963static void
1964hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1965{
1966	struct hn_softc *sc = xsc;
1967
1968	HN_LOCK(sc);
1969
1970	if (sc->hn_vf_ifp == NULL)
1971		goto done;
1972
1973	if (!hn_ismyvf(sc, ifp))
1974		goto done;
1975
1976	if (hn_xpnt_vf) {
1977		/*
1978		 * Make sure that the delayed initialization is not running.
1979		 *
1980		 * NOTE:
1981		 * - This lock _must_ be released, since the hn_vf_init task
1982		 *   will try holding this lock.
1983		 * - It is safe to release this lock here, since the
1984		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1985		 *
1986		 * XXX racy, if hn(4) ever detached.
1987		 */
1988		HN_UNLOCK(sc);
1989		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1990		HN_LOCK(sc);
1991
1992		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1993		    sc->hn_ifp->if_xname));
1994		ifp->if_input = sc->hn_vf_input;
1995		sc->hn_vf_input = NULL;
1996
1997		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1998		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1999			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2000
2001		if (sc->hn_vf_rdytick == 0) {
2002			/*
2003			 * The VF was ready; restore some settings.
2004			 */
2005			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2006			/*
2007			 * NOTE:
2008			 * There is _no_ need to fixup if_capenable and
2009			 * if_hwassist, since the if_capabilities before
2010			 * restoration was an intersection of the VF's
2011			 * if_capabilities and the synthetic device's
2012			 * if_capabilities.
2013			 */
2014			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2015			sc->hn_ifp->if_hw_tsomaxsegcount =
2016			    sc->hn_saved_tsosegcnt;
2017			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2018		}
2019
2020		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2021			/*
2022			 * Restore RSS settings.
2023			 */
2024			hn_vf_rss_restore(sc);
2025
2026			/*
2027			 * Resume link status management, which was suspended
2028			 * by hn_ifnet_attevent().
2029			 */
2030			hn_resume_mgmt(sc);
2031		}
2032	}
2033
2034	/* Mark transparent mode VF as disabled. */
2035	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2036
2037	rm_wlock(&hn_vfmap_lock);
2038
2039	KASSERT(ifp->if_index < hn_vfmap_size,
2040	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2041	if (hn_vfmap[ifp->if_index] != NULL) {
2042		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2043		    ("%s: ifindex %d was mapped to %s",
2044		     ifp->if_xname, ifp->if_index,
2045		     hn_vfmap[ifp->if_index]->if_xname));
2046		hn_vfmap[ifp->if_index] = NULL;
2047	}
2048
2049	rm_wunlock(&hn_vfmap_lock);
2050done:
2051	HN_UNLOCK(sc);
2052}
2053
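/*
 * Propagate the VF's link state changes to the hn(4) interface; this
 * handler is registered only in transparent VF mode.
 */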
2054static void
2055hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2056{
2057	struct hn_softc *sc = xsc;
2058
2059	if (sc->hn_vf_ifp == ifp)
2060		if_link_state_change(sc->hn_ifp, link_state);
2061}
2062
2063static int
2064hn_probe(device_t dev)
2065{
2066
2067	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2068		device_set_desc(dev, "Hyper-V Network Interface");
2069		return BUS_PROBE_DEFAULT;
2070	}
2071	return ENXIO;
2072}
2073
2074static int
2075hn_attach(device_t dev)
2076{
2077	struct hn_softc *sc = device_get_softc(dev);
2078	struct sysctl_oid_list *child;
2079	struct sysctl_ctx_list *ctx;
2080	uint8_t eaddr[ETHER_ADDR_LEN];
2081	struct ifnet *ifp = NULL;
2082	int error, ring_cnt, tx_ring_cnt;
2083	uint32_t mtu;
2084
2085	sc->hn_dev = dev;
2086	sc->hn_prichan = vmbus_get_channel(dev);
2087	HN_LOCK_INIT(sc);
2088	rm_init(&sc->hn_vf_lock, "hnvf");
2089	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2090		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2091
2092	/*
2093	 * Initialize these tunables once.
2094	 */
2095	sc->hn_agg_size = hn_tx_agg_size;
2096	sc->hn_agg_pkts = hn_tx_agg_pkts;
2097
2098	/*
2099	 * Setup taskqueue for transmission.
2100	 */
2101	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2102		int i;
2103
2104		sc->hn_tx_taskqs =
2105		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2106		    M_DEVBUF, M_WAITOK);
2107		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2108			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2109			    M_WAITOK, taskqueue_thread_enqueue,
2110			    &sc->hn_tx_taskqs[i]);
2111			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2112			    "%s tx%d", device_get_nameunit(dev), i);
2113		}
2114	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2115		sc->hn_tx_taskqs = hn_tx_taskque;
2116	}
2117
2118	/*
2119	 * Setup taskqueue for management tasks, e.g. link status.
2120	 */
2121	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2122	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2123	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2124	    device_get_nameunit(dev));
2125	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2126	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2127	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2128	    hn_netchg_status_taskfunc, sc);
2129
2130	if (hn_xpnt_vf) {
2131		/*
2132		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
2133		 */
2134		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2135		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2136		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2137		    device_get_nameunit(dev));
2138		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2139		    hn_xpnt_vf_init_taskfunc, sc);
2140	}
2141
2142	/*
2143	 * Allocate ifnet and setup its name earlier, so that if_printf
2144	 * can be used by functions, which will be called after
2145	 * ether_ifattach().
2146	 */
2147	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
2148	ifp->if_softc = sc;
2149	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2150
2151	/*
2152	 * Initialize ifmedia earlier so that it can be unconditionally
2153	 * destroyed if an error happens later on.
2154	 */
2155	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2156
2157	/*
2158	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2159	 * to use (tx_ring_cnt).
2160	 *
2161	 * NOTE:
2162	 * The # of RX rings to use is the same as the # of channels to use.
2163	 */
2164	ring_cnt = hn_chan_cnt;
2165	if (ring_cnt <= 0) {
2166		/* Default */
2167		ring_cnt = mp_ncpus;
2168		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2169			ring_cnt = HN_RING_CNT_DEF_MAX;
2170	} else if (ring_cnt > mp_ncpus) {
2171		ring_cnt = mp_ncpus;
2172	}
2173
2174	tx_ring_cnt = hn_tx_ring_cnt;
2175	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2176		tx_ring_cnt = ring_cnt;
2177#ifdef HN_IFSTART_SUPPORT
2178	if (hn_use_if_start) {
2179		/* ifnet.if_start only needs one TX ring. */
2180		tx_ring_cnt = 1;
2181	}
2182#endif
2183
2184	/*
2185	 * Set the leader CPU for channels.
2186	 */
2187	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2188
2189	/*
2190	 * Create enough TX/RX rings, even if only a limited number of
2191	 * channels can be allocated.
2192	 */
2193	error = hn_create_tx_data(sc, tx_ring_cnt);
2194	if (error)
2195		goto failed;
2196	error = hn_create_rx_data(sc, ring_cnt);
2197	if (error)
2198		goto failed;
2199
2200	/*
2201	 * Create transaction context for NVS and RNDIS transactions.
2202	 */
2203	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2204	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2205	if (sc->hn_xact == NULL) {
2206		error = ENXIO;
2207		goto failed;
2208	}
2209
2210	/*
2211	 * Install orphan handler for the revocation of this device's
2212	 * primary channel.
2213	 *
2214	 * NOTE:
2215	 * The processing order is critical here:
2216	 * Install the orphan handler, _before_ testing whether this
2217	 * device's primary channel has been revoked or not.
2218	 */
2219	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2220	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2221		error = ENXIO;
2222		goto failed;
2223	}
2224
2225	/*
2226	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2227	 */
2228	error = hn_synth_attach(sc, ETHERMTU);
2229	if (error)
2230		goto failed;
2231
2232	error = hn_rndis_get_eaddr(sc, eaddr);
2233	if (error)
2234		goto failed;
2235
2236	error = hn_rndis_get_mtu(sc, &mtu);
2237	if (error)
2238		mtu = ETHERMTU;
2239	else if (bootverbose)
2240		device_printf(dev, "RNDIS mtu %u\n", mtu);
2241
2242#if __FreeBSD_version >= 1100099
2243	if (sc->hn_rx_ring_inuse > 1) {
2244		/*
2245		 * Reduce TCP segment aggregation limit for multiple
2246		 * RX rings to increase ACK timeliness.
2247		 */
2248		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2249	}
2250#endif
2251
2252	/*
2253	 * Fix up TX/RX state after the synthetic parts are attached.
2254	 */
2255	hn_fixup_tx_data(sc);
2256	hn_fixup_rx_data(sc);
2257
2258	ctx = device_get_sysctl_ctx(dev);
2259	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2260	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2261	    &sc->hn_nvs_ver, 0, "NVS version");
2262	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2263	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2264	    hn_ndis_version_sysctl, "A", "NDIS version");
2265	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2266	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2267	    hn_caps_sysctl, "A", "capabilities");
2268	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2269	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2270	    hn_hwassist_sysctl, "A", "hwassist");
2271	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2272	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2273	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2274	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2275	    "max # of TSO segments");
2276	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2277	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2278	    "max size of TSO segment");
2279	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2280	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281	    hn_rxfilter_sysctl, "A", "rxfilter");
2282	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2283	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2284	    hn_rss_hash_sysctl, "A", "RSS hash");
2285	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2286	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2287	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2288	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2289	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2290	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2291	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2292	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2293	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2294	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2295	    hn_rss_key_sysctl, "IU", "RSS key");
2296	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2297	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2298	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2299	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2300	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2301	    "RNDIS offered packet transmission aggregation size limit");
2302	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2303	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2304	    "RNDIS offered packet transmission aggregation count limit");
2305	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2306	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2307	    "RNDIS packet transmission aggregation alignment");
2308	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2309	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2310	    hn_txagg_size_sysctl, "I",
2311	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2312	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2313	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2314	    hn_txagg_pkts_sysctl, "I",
2315	    "Packet transmission aggregation packets, "
2316	    "0 -- disable, -1 -- auto");
2317	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2318	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2319	    hn_polling_sysctl, "I",
2320	    "Polling frequency: [100,1000000], 0 disable polling");
2321	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2322	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2323	    hn_vf_sysctl, "A", "Virtual Function's name");
2324	if (!hn_xpnt_vf) {
2325		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2326		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2327		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2328	} else {
2329		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2330		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2331		    hn_xpnt_vf_enabled_sysctl, "I",
2332		    "Transparent VF enabled");
2333		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2334		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2335		    hn_xpnt_vf_accbpf_sysctl, "I",
2336		    "Accurate BPF for transparent VF");
2337	}
2338
2339	/*
2340	 * Setup the ifmedia, which has been initialized earlier.
2341	 */
2342	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2343	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2344	/* XXX ifmedia_set really should do this for us */
2345	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2346
2347	/*
2348	 * Setup the ifnet for this interface.
2349	 */
2350
2351#ifdef __LP64__
2352	ifp->if_baudrate = IF_Gbps(10);
2353#else
2354	/* if_baudrate is 32 bits on 32-bit systems. */
2355	ifp->if_baudrate = IF_Gbps(1);
2356#endif
2357	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2358	ifp->if_ioctl = hn_ioctl;
2359	ifp->if_init = hn_init;
2360#ifdef HN_IFSTART_SUPPORT
2361	if (hn_use_if_start) {
2362		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2363
2364		ifp->if_start = hn_start;
2365		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2366		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2367		IFQ_SET_READY(&ifp->if_snd);
2368	} else
2369#endif
2370	{
2371		ifp->if_transmit = hn_transmit;
2372		ifp->if_qflush = hn_xmit_qflush;
2373	}
2374
2375	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2376#ifdef foo
2377	/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2378	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2379#endif
2380	if (sc->hn_caps & HN_CAP_VLAN) {
2381		/* XXX not sure about VLAN_MTU. */
2382		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2383	}
2384
2385	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2386	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2387		ifp->if_capabilities |= IFCAP_TXCSUM;
2388	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2389		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2390	if (sc->hn_caps & HN_CAP_TSO4) {
2391		ifp->if_capabilities |= IFCAP_TSO4;
2392		ifp->if_hwassist |= CSUM_IP_TSO;
2393	}
2394	if (sc->hn_caps & HN_CAP_TSO6) {
2395		ifp->if_capabilities |= IFCAP_TSO6;
2396		ifp->if_hwassist |= CSUM_IP6_TSO;
2397	}
2398
2399	/* Enable all available capabilities by default. */
2400	ifp->if_capenable = ifp->if_capabilities;
2401
2402	/*
2403	 * Disable IPv6 TSO and TXCSUM by default; they can still
2404	 * be enabled through SIOCSIFCAP.
2405	 */
2406	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2407	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2408
2409	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2410		/*
2411		 * Lock hn_set_tso_maxsize() to simplify its
2412		 * internal logic.
2413		 */
2414		HN_LOCK(sc);
2415		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2416		HN_UNLOCK(sc);
2417		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2418		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2419	}
2420
2421	ether_ifattach(ifp, eaddr);
2422
2423	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2424		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2425		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2426	}
2427	if (mtu < ETHERMTU) {
2428		if_printf(ifp, "fixup mtu %lu -> %u\n", ifp->if_mtu, mtu);
2429		ifp->if_mtu = mtu;
2430	}
2431
2432	/* Inform the upper layer about the long frame support. */
2433	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2434
2435	/*
2436	 * Kick off link status check.
2437	 */
2438	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2439	hn_update_link_status(sc);
2440
2441	if (!hn_xpnt_vf) {
2442		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2443		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2444		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2445		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2446	} else {
2447		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2448		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2449	}
2450
2451	/*
2452	 * NOTE:
2453	 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2454	 * event, since the interface's LLADDR is needed; the interface LLADDR
2455	 * is not available when the ifnet_arrival event is triggered.
2456	 */
2457	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2458	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2459	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2460	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2461
2462	return (0);
2463failed:
2464	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2465		hn_synth_detach(sc);
2466	hn_detach(dev);
2467	return (error);
2468}
2469
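/*
 * Device detach method: tear down event handlers, detach the synthetic
 * parts, destroy the TX/RX data and taskqueues, and release the VMBUS
 * transaction context.
 */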
2470static int
2471hn_detach(device_t dev)
2472{
2473	struct hn_softc *sc = device_get_softc(dev);
2474	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2475
2476	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2477		/*
2478		 * In case the vmbus missed the orphan handler
2479		 * installation.
2480		 */
2481		vmbus_xact_ctx_orphan(sc->hn_xact);
2482	}
2483
2484	if (sc->hn_ifaddr_evthand != NULL)
2485		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2486	if (sc->hn_ifnet_evthand != NULL)
2487		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2488	if (sc->hn_ifnet_atthand != NULL) {
2489		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2490		    sc->hn_ifnet_atthand);
2491	}
2492	if (sc->hn_ifnet_dethand != NULL) {
2493		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2494		    sc->hn_ifnet_dethand);
2495	}
2496	if (sc->hn_ifnet_lnkhand != NULL)
2497		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2498
2499	vf_ifp = sc->hn_vf_ifp;
2500	__compiler_membar();
2501	if (vf_ifp != NULL)
2502		hn_ifnet_detevent(sc, vf_ifp);
2503
2504	if (device_is_attached(dev)) {
2505		HN_LOCK(sc);
2506		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2507			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2508				hn_stop(sc, true);
2509			/*
2510			 * NOTE:
2511			 * hn_stop() only suspends the data path, so management
2512			 * tasks have to be suspended manually here.
2513			 */
2514			hn_suspend_mgmt(sc);
2515			hn_synth_detach(sc);
2516		}
2517		HN_UNLOCK(sc);
2518		ether_ifdetach(ifp);
2519	}
2520
2521	ifmedia_removeall(&sc->hn_media);
2522	hn_destroy_rx_data(sc);
2523	hn_destroy_tx_data(sc);
2524
2525	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2526		int i;
2527
2528		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2529			taskqueue_free(sc->hn_tx_taskqs[i]);
2530		free(sc->hn_tx_taskqs, M_DEVBUF);
2531	}
2532	taskqueue_free(sc->hn_mgmt_taskq0);
2533	if (sc->hn_vf_taskq != NULL)
2534		taskqueue_free(sc->hn_vf_taskq);
2535
2536	if (sc->hn_xact != NULL) {
2537		/*
2538		 * Uninstall the orphan handler _before_ the xact is
2539		 * destructed.
2540		 */
2541		vmbus_chan_unset_orphan(sc->hn_prichan);
2542		vmbus_xact_ctx_destroy(sc->hn_xact);
2543	}
2544
2545	if_free(ifp);
2546
2547	HN_LOCK_DESTROY(sc);
2548	rm_destroy(&sc->hn_vf_lock);
2549	return (0);
2550}
2551
2552static int
2553hn_shutdown(device_t dev)
2554{
2555
2556	return (0);
2557}
2558
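/*
 * Query the RNDIS link status and report the result to the network stack.
 */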
2559static void
2560hn_link_status(struct hn_softc *sc)
2561{
2562	uint32_t link_status;
2563	int error;
2564
2565	error = hn_rndis_get_linkstatus(sc, &link_status);
2566	if (error) {
2567		/* XXX what to do? */
2568		return;
2569	}
2570
2571	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2572		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2573	else
2574		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2575	if_link_state_change(sc->hn_ifp,
2576	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2577	    LINK_STATE_UP : LINK_STATE_DOWN);
2578}
2579
2580static void
2581hn_link_taskfunc(void *xsc, int pending __unused)
2582{
2583	struct hn_softc *sc = xsc;
2584
2585	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2586		return;
2587	hn_link_status(sc);
2588}
2589
2590static void
2591hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2592{
2593	struct hn_softc *sc = xsc;
2594
2595	/* Prevent any link status checks from running. */
2596	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2597
2598	/*
2599	 * Fake up a [link down --> link up] state change; a 5 second
2600	 * delay is used, which closely simulates the miibus reaction
2601	 * to a link down event.
2602	 */
2603	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2604	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2605	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2606	    &sc->hn_netchg_status, 5 * hz);
2607}
2608
2609static void
2610hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2611{
2612	struct hn_softc *sc = xsc;
2613
2614	/* Re-allow link status checks. */
2615	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2616	hn_link_status(sc);
2617}
2618
2619static void
2620hn_update_link_status(struct hn_softc *sc)
2621{
2622
2623	if (sc->hn_mgmt_taskq != NULL)
2624		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2625}
2626
2627static void
2628hn_change_network(struct hn_softc *sc)
2629{
2630
2631	if (sc->hn_mgmt_taskq != NULL)
2632		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2633}
2634
2635static __inline int
2636hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2637    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2638{
2639	struct mbuf *m = *m_head;
2640	int error;
2641
2642	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2643
2644	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2645	    m, segs, nsegs, BUS_DMA_NOWAIT);
2646	if (error == EFBIG) {
2647		struct mbuf *m_new;
2648
2649		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2650		if (m_new == NULL)
2651			return ENOBUFS;
2652		else
2653			*m_head = m = m_new;
2654		txr->hn_tx_collapsed++;
2655
2656		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2657		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2658	}
2659	if (!error) {
2660		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2661		    BUS_DMASYNC_PREWRITE);
2662		txd->flags |= HN_TXD_FLAG_DMAMAP;
2663	}
2664	return error;
2665}
2666
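/*
 * Drop a reference on the TX descriptor; on the last reference, free any
 * aggregated descriptors, the chimney buffer or DMA map, and the associated
 * mbuf, then return the descriptor to the free list.  Returns 1 if the
 * descriptor was actually freed, 0 otherwise.
 */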
2667static __inline int
2668hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2669{
2670
2671	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2672	    ("put an onlist txd %#x", txd->flags));
2673	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2674	    ("put an onagg txd %#x", txd->flags));
2675
2676	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2677	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2678		return 0;
2679
2680	if (!STAILQ_EMPTY(&txd->agg_list)) {
2681		struct hn_txdesc *tmp_txd;
2682
2683		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2684			int freed;
2685
2686			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2687			    ("recursive aggregation on aggregated txdesc"));
2688			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2689			    ("not aggregated txdesc"));
2690			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2691			    ("aggregated txdesc uses dmamap"));
2692			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2693			    ("aggregated txdesc consumes "
2694			     "chimney sending buffer"));
2695			KASSERT(tmp_txd->chim_size == 0,
2696			    ("aggregated txdesc has non-zero "
2697			     "chimney sending size"));
2698
2699			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2700			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2701			freed = hn_txdesc_put(txr, tmp_txd);
2702			KASSERT(freed, ("failed to free aggregated txdesc"));
2703		}
2704	}
2705
2706	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2707		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2708		    ("chim txd uses dmamap"));
2709		hn_chim_free(txr->hn_sc, txd->chim_index);
2710		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2711		txd->chim_size = 0;
2712	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2713		bus_dmamap_sync(txr->hn_tx_data_dtag,
2714		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2715		bus_dmamap_unload(txr->hn_tx_data_dtag,
2716		    txd->data_dmap);
2717		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2718	}
2719
2720	if (txd->m != NULL) {
2721		m_freem(txd->m);
2722		txd->m = NULL;
2723	}
2724
2725	txd->flags |= HN_TXD_FLAG_ONLIST;
2726#ifndef HN_USE_TXDESC_BUFRING
2727	mtx_lock_spin(&txr->hn_txlist_spin);
2728	KASSERT(txr->hn_txdesc_avail >= 0 &&
2729	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2730	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2731	txr->hn_txdesc_avail++;
2732	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2733	mtx_unlock_spin(&txr->hn_txlist_spin);
2734#else	/* HN_USE_TXDESC_BUFRING */
2735#ifdef HN_DEBUG
2736	atomic_add_int(&txr->hn_txdesc_avail, 1);
2737#endif
2738	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2739#endif	/* !HN_USE_TXDESC_BUFRING */
2740
2741	return 1;
2742}
2743
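/*
 * Allocate a TX descriptor from the ring's free list (or buf_ring) and
 * initialize its reference count.  Returns NULL if the ring is exhausted.
 */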
2744static __inline struct hn_txdesc *
2745hn_txdesc_get(struct hn_tx_ring *txr)
2746{
2747	struct hn_txdesc *txd;
2748
2749#ifndef HN_USE_TXDESC_BUFRING
2750	mtx_lock_spin(&txr->hn_txlist_spin);
2751	txd = SLIST_FIRST(&txr->hn_txlist);
2752	if (txd != NULL) {
2753		KASSERT(txr->hn_txdesc_avail > 0,
2754		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2755		txr->hn_txdesc_avail--;
2756		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2757	}
2758	mtx_unlock_spin(&txr->hn_txlist_spin);
2759#else
2760	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2761#endif
2762
2763	if (txd != NULL) {
2764#ifdef HN_USE_TXDESC_BUFRING
2765#ifdef HN_DEBUG
2766		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2767#endif
2768#endif	/* HN_USE_TXDESC_BUFRING */
2769		KASSERT(txd->m == NULL && txd->refs == 0 &&
2770		    STAILQ_EMPTY(&txd->agg_list) &&
2771		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2772		    txd->chim_size == 0 &&
2773		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2774		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2775		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2776		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2777		txd->refs = 1;
2778	}
2779	return txd;
2780}
2781
2782static __inline void
2783hn_txdesc_hold(struct hn_txdesc *txd)
2784{
2785
2786	/* 0->1 transition will never work */
2787	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2788	atomic_add_int(&txd->refs, 1);
2789}
2790
2791static __inline void
2792hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2793{
2794
2795	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2796	    ("recursive aggregation on aggregating txdesc"));
2797
2798	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2799	    ("already aggregated"));
2800	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2801	    ("recursive aggregation on to-be-aggregated txdesc"));
2802
2803	txd->flags |= HN_TXD_FLAG_ONAGG;
2804	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2805}
2806
2807static bool
2808hn_tx_ring_pending(struct hn_tx_ring *txr)
2809{
2810	bool pending = false;
2811
2812#ifndef HN_USE_TXDESC_BUFRING
2813	mtx_lock_spin(&txr->hn_txlist_spin);
2814	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2815		pending = true;
2816	mtx_unlock_spin(&txr->hn_txlist_spin);
2817#else
2818	if (!buf_ring_full(txr->hn_txdesc_br))
2819		pending = true;
2820#endif
2821	return (pending);
2822}
2823
2824static __inline void
2825hn_txeof(struct hn_tx_ring *txr)
2826{
2827	txr->hn_has_txeof = 0;
2828	txr->hn_txeof(txr);
2829}
2830
2831static void
2832hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2833    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2834{
2835	struct hn_txdesc *txd = sndc->hn_cbarg;
2836	struct hn_tx_ring *txr;
2837
2838	txr = txd->txr;
2839	KASSERT(txr->hn_chan == chan,
2840	    ("channel mismatch, on chan%u, should be chan%u",
2841	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2842
2843	txr->hn_has_txeof = 1;
2844	hn_txdesc_put(txr, txd);
2845
2846	++txr->hn_txdone_cnt;
2847	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2848		txr->hn_txdone_cnt = 0;
2849		if (txr->hn_oactive)
2850			hn_txeof(txr);
2851	}
2852}
2853
2854static void
2855hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2856{
2857#if defined(INET) || defined(INET6)
2858	struct lro_ctrl *lro = &rxr->hn_lro;
2859	struct lro_entry *queued;
2860
2861	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2862		SLIST_REMOVE_HEAD(&lro->lro_active, next);
2863		tcp_lro_flush(lro, queued);
2864	}
2865#endif
2866
2867	/*
2868	 * NOTE:
2869	 * 'txr' could be NULL, if multiple channels and the
2870	 * ifnet.if_start method are enabled.
2871	 */
2872	if (txr == NULL || !txr->hn_has_txeof)
2873		return;
2874
2875	txr->hn_txdone_cnt = 0;
2876	hn_txeof(txr);
2877}
2878
2879static __inline uint32_t
2880hn_rndis_pktmsg_offset(uint32_t ofs)
2881{
2882
2883	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2884	    ("invalid RNDIS packet msg offset %u", ofs));
2885	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2886}
2887
2888static __inline void *
2889hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2890    size_t pi_dlen, uint32_t pi_type)
2891{
2892	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2893	struct rndis_pktinfo *pi;
2894
2895	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2896	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2897
2898	/*
2899	 * Per-packet-info does not move; it only grows.
2900	 *
2901	 * NOTE:
2902	 * rm_pktinfooffset in this phase counts from the beginning
2903	 * of rndis_packet_msg.
2904	 */
2905	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2906	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2907	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2908	    pkt->rm_pktinfolen);
2909	pkt->rm_pktinfolen += pi_size;
2910
2911	pi->rm_size = pi_size;
2912	pi->rm_type = pi_type;
2913	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2914
2915	return (pi->rm_data);
2916}
2917
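/*
 * Transmit the pending aggregated txdesc and reset the ring's aggregation
 * state.  The packet count is saved beforehand, so that oerrors can be
 * accounted if hn_txpkt() fails.
 */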
2918static __inline int
2919hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2920{
2921	struct hn_txdesc *txd;
2922	struct mbuf *m;
2923	int error, pkts;
2924
2925	txd = txr->hn_agg_txd;
2926	KASSERT(txd != NULL, ("no aggregate txdesc"));
2927
2928	/*
2929	 * Since hn_txpkt() will reset this temporary stat, save
2930	 * it now, so that oerrors can be updated properly, if
2931	 * hn_txpkt() ever fails.
2932	 */
2933	pkts = txr->hn_stat_pkts;
2934
2935	/*
2936	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2937	 * failure, save it for later freeing, if hn_txpkt() ever
2938	 * fails.
2939	 */
2940	m = txd->m;
2941	error = hn_txpkt(ifp, txr, txd);
2942	if (__predict_false(error)) {
2943		/* txd is freed, but m is not. */
2944		m_freem(m);
2945
2946		txr->hn_flush_failed++;
2947		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2948	}
2949
2950	/* Reset all aggregation states. */
2951	txr->hn_agg_txd = NULL;
2952	txr->hn_agg_szleft = 0;
2953	txr->hn_agg_pktleft = 0;
2954	txr->hn_agg_prevpkt = NULL;
2955
2956	return (error);
2957}
2958
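/*
 * Try to place this packet into the chimney sending buffer: either append
 * it to the currently aggregating txdesc, or allocate a fresh chimney
 * buffer (possibly starting a new aggregation).  Returns a pointer into
 * the chimney buffer, or NULL if chimney sending cannot be used.
 */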
2959static void *
2960hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2961    int pktsize)
2962{
2963	void *chim;
2964
2965	if (txr->hn_agg_txd != NULL) {
2966		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2967			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2968			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2969			int olen;
2970
2971			/*
2972			 * Update the previous RNDIS packet's total length;
2973			 * it can be increased due to the mandatory alignment
2974			 * padding for this RNDIS packet.  And update the
2975			 * aggregating txdesc's chimney sending buffer size
2976			 * accordingly.
2977			 *
2978			 * XXX
2979			 * Zero-out the padding, as required by the RNDIS spec.
2980			 */
2981			olen = pkt->rm_len;
2982			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2983			agg_txd->chim_size += pkt->rm_len - olen;
2984
2985			/* Link this txdesc to the parent. */
2986			hn_txdesc_agg(agg_txd, txd);
2987
2988			chim = (uint8_t *)pkt + pkt->rm_len;
2989			/* Save the current packet for later fixup. */
2990			txr->hn_agg_prevpkt = chim;
2991
2992			txr->hn_agg_pktleft--;
2993			txr->hn_agg_szleft -= pktsize;
2994			if (txr->hn_agg_szleft <=
2995			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2996				/*
2997				 * Probably can't aggregate more packets,
2998				 * flush this aggregating txdesc proactively.
2999				 */
3000				txr->hn_agg_pktleft = 0;
3001			}
3002			/* Done! */
3003			return (chim);
3004		}
3005		hn_flush_txagg(ifp, txr);
3006	}
3007	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3008
3009	txr->hn_tx_chimney_tried++;
3010	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3011	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3012		return (NULL);
3013	txr->hn_tx_chimney++;
3014
3015	chim = txr->hn_sc->hn_chim +
3016	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3017
3018	if (txr->hn_agg_pktmax > 1 &&
3019	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3020		txr->hn_agg_txd = txd;
3021		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3022		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3023		txr->hn_agg_prevpkt = chim;
3024	}
3025	return (chim);
3026}
3027
3028/*
3029 * NOTE:
3030 * If this function fails, then both txd and m_head0 will be freed.
3031 */
3032static int
3033hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3034    struct mbuf **m_head0)
3035{
3036	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3037	int error, nsegs, i;
3038	struct mbuf *m_head = *m_head0;
3039	struct rndis_packet_msg *pkt;
3040	uint32_t *pi_data;
3041	void *chim = NULL;
3042	int pkt_hlen, pkt_size;
3043
3044	pkt = txd->rndis_pkt;
3045	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3046	if (pkt_size < txr->hn_chim_size) {
3047		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3048		if (chim != NULL)
3049			pkt = chim;
3050	} else {
3051		if (txr->hn_agg_txd != NULL)
3052			hn_flush_txagg(ifp, txr);
3053	}
3054
3055	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3056	pkt->rm_len = m_head->m_pkthdr.len;
3057	pkt->rm_dataoffset = 0;
3058	pkt->rm_datalen = m_head->m_pkthdr.len;
3059	pkt->rm_oobdataoffset = 0;
3060	pkt->rm_oobdatalen = 0;
3061	pkt->rm_oobdataelements = 0;
3062	pkt->rm_pktinfooffset = sizeof(*pkt);
3063	pkt->rm_pktinfolen = 0;
3064	pkt->rm_vchandle = 0;
3065	pkt->rm_reserved = 0;
3066
3067	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3068		/*
3069		 * Set the hash value for this packet, so that the host could
3070		 * dispatch the TX done event for this packet back to this TX
3071		 * ring's channel.
3072		 */
3073		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3074		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3075		*pi_data = txr->hn_tx_idx;
3076	}
3077
3078	if (m_head->m_flags & M_VLANTAG) {
3079		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3080		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3081		*pi_data = NDIS_VLAN_INFO_MAKE(
3082		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3083		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3084		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3085	}
3086
3087	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3088#if defined(INET6) || defined(INET)
3089		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3090		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3091#ifdef INET
3092		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3093			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3094			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3095			    m_head->m_pkthdr.tso_segsz);
3096		}
3097#endif
3098#if defined(INET6) && defined(INET)
3099		else
3100#endif
3101#ifdef INET6
3102		{
3103			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3104			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3105			    m_head->m_pkthdr.tso_segsz);
3106		}
3107#endif
3108#endif	/* INET6 || INET */
3109	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3110		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3111		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3112		if (m_head->m_pkthdr.csum_flags &
3113		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3114			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3115		} else {
3116			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3117			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3118				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3119		}
3120
3121		if (m_head->m_pkthdr.csum_flags &
3122		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3123			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3124			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3125		} else if (m_head->m_pkthdr.csum_flags &
3126		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3127			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3128			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3129		}
3130	}
3131
3132	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3133	/* Fixup RNDIS packet message total length */
3134	pkt->rm_len += pkt_hlen;
3135	/* Convert RNDIS packet message offsets */
3136	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3137	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3138
3139	/*
3140	 * Fast path: Chimney sending.
3141	 */
3142	if (chim != NULL) {
3143		struct hn_txdesc *tgt_txd = txd;
3144
3145		if (txr->hn_agg_txd != NULL) {
3146			tgt_txd = txr->hn_agg_txd;
3147#ifdef INVARIANTS
3148			*m_head0 = NULL;
3149#endif
3150		}
3151
3152		KASSERT(pkt == chim,
3153		    ("RNDIS pkt not in chimney sending buffer"));
3154		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3155		    ("chimney sending buffer is not used"));
3156		tgt_txd->chim_size += pkt->rm_len;
3157
3158		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3159		    ((uint8_t *)chim) + pkt_hlen);
3160
3161		txr->hn_gpa_cnt = 0;
3162		txr->hn_sendpkt = hn_txpkt_chim;
3163		goto done;
3164	}
3165
3166	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3167	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3168	    ("chimney buffer is used"));
3169	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3170
3171	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3172	if (__predict_false(error)) {
3173		int freed;
3174
3175		/*
3176		 * This mbuf is not linked w/ the txd yet, so free it now.
3177		 */
3178		m_freem(m_head);
3179		*m_head0 = NULL;
3180
3181		freed = hn_txdesc_put(txr, txd);
3182		KASSERT(freed != 0,
3183		    ("fail to free txd upon txdma error"));
3184
3185		txr->hn_txdma_failed++;
3186		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3187		return error;
3188	}
3189	*m_head0 = m_head;
3190
3191	/* +1 RNDIS packet message */
3192	txr->hn_gpa_cnt = nsegs + 1;
3193
3194	/* send packet with page buffer */
3195	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3196	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3197	txr->hn_gpa[0].gpa_len = pkt_hlen;
3198
3199	/*
3200	 * Fill the page buffers with mbuf info after the page
3201	 * buffer for RNDIS packet message.
3202	 */
3203	for (i = 0; i < nsegs; ++i) {
3204		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3205
3206		gpa->gpa_page = atop(segs[i].ds_addr);
3207		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3208		gpa->gpa_len = segs[i].ds_len;
3209	}
3210
3211	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3212	txd->chim_size = 0;
3213	txr->hn_sendpkt = hn_txpkt_sglist;
3214done:
3215	txd->m = m_head;
3216
3217	/* Set the completion routine */
3218	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3219
3220	/* Update temporary stats for later use. */
3221	txr->hn_stat_pkts++;
3222	txr->hn_stat_size += m_head->m_pkthdr.len;
3223	if (m_head->m_flags & M_MCAST)
3224		txr->hn_stat_mcasts++;
3225
3226	return 0;
3227}
3228
3229/*
3230 * NOTE:
3231 * If this function fails, then txd will be freed, but the mbuf
3232 * associated w/ the txd will _not_ be freed.
3233 */
3234static int
3235hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3236{
3237	int error, send_failed = 0, has_bpf;
3238
3239again:
3240	has_bpf = bpf_peers_present(ifp->if_bpf);
3241	if (has_bpf) {
3242		/*
3243		 * Make sure that this txd and any aggregated txds are not
3244		 * freed before ETHER_BPF_MTAP.
3245		 */
3246		hn_txdesc_hold(txd);
3247	}
3248	error = txr->hn_sendpkt(txr, txd);
3249	if (!error) {
3250		if (has_bpf) {
3251			const struct hn_txdesc *tmp_txd;
3252
3253			ETHER_BPF_MTAP(ifp, txd->m);
3254			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3255				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3256		}
3257
3258		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3259#ifdef HN_IFSTART_SUPPORT
3260		if (!hn_use_if_start)
3261#endif
3262		{
3263			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3264			    txr->hn_stat_size);
3265			if (txr->hn_stat_mcasts != 0) {
3266				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3267				    txr->hn_stat_mcasts);
3268			}
3269		}
3270		txr->hn_pkts += txr->hn_stat_pkts;
3271		txr->hn_sends++;
3272	}
3273	if (has_bpf)
3274		hn_txdesc_put(txr, txd);
3275
3276	if (__predict_false(error)) {
3277		int freed;
3278
3279		/*
3280		 * This should "really rarely" happen.
3281		 *
3282		 * XXX Too many RX to be acked or too many sideband
3283		 * commands to run?  Ask netvsc_channel_rollup()
3284		 * to kick start later.
3285		 */
3286		txr->hn_has_txeof = 1;
3287		if (!send_failed) {
3288			txr->hn_send_failed++;
3289			send_failed = 1;
3290			/*
3291			 * Try sending again after setting hn_has_txeof,
3292			 * in case we missed the last
3293			 * netvsc_channel_rollup().
3294			 */
3295			goto again;
3296		}
3297		if_printf(ifp, "send failed\n");
3298
3299		/*
3300		 * Caller will perform further processing on the
3301		 * associated mbuf, so don't free it in hn_txdesc_put();
3302		 * only unload it from the DMA map in hn_txdesc_put(),
3303		 * if it was loaded.
3304		 */
3305		txd->m = NULL;
3306		freed = hn_txdesc_put(txr, txd);
3307		KASSERT(freed != 0,
3308		    ("fail to free txd upon send error"));
3309
3310		txr->hn_send_failed++;
3311	}
3312
3313	/* Reset temporary stats, after this sending is done. */
3314	txr->hn_stat_size = 0;
3315	txr->hn_stat_pkts = 0;
3316	txr->hn_stat_mcasts = 0;
3317
3318	return (error);
3319}
3320
3321/*
3322 * Append the specified data to the indicated mbuf chain.
3323 * Extend the mbuf chain if the new data does not fit in
3324 * existing space.
3325 *
3326 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3327 * There should be an equivalent in the kernel mbuf code,
3328 * but there does not appear to be one yet.
3329 *
3330 * Differs from m_append() in that additional mbufs are
3331 * allocated with cluster size MJUMPAGESIZE, and filled
3332 * accordingly.
3333 *
3334 * Return 1 if able to complete the job; otherwise 0.
3335 */
3336static int
3337hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3338{
3339	struct mbuf *m, *n;
3340	int remainder, space;
3341
3342	for (m = m0; m->m_next != NULL; m = m->m_next)
3343		;
3344	remainder = len;
3345	space = M_TRAILINGSPACE(m);
3346	if (space > 0) {
3347		/*
3348		 * Copy into available space.
3349		 */
3350		if (space > remainder)
3351			space = remainder;
3352		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3353		m->m_len += space;
3354		cp += space;
3355		remainder -= space;
3356	}
3357	while (remainder > 0) {
3358		/*
3359		 * Allocate a new mbuf; could check space
3360		 * and allocate a cluster instead.
3361		 */
3362		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
3363		if (n == NULL)
3364			break;
3365		n->m_len = min(MJUMPAGESIZE, remainder);
3366		bcopy(cp, mtod(n, caddr_t), n->m_len);
3367		cp += n->m_len;
3368		remainder -= n->m_len;
3369		m->m_next = n;
3370		m = n;
3371	}
3372	if (m0->m_flags & M_PKTHDR)
3373		m0->m_pkthdr.len += len - remainder;
3374
3375	return (remainder == 0);
3376}
3377
3378#if defined(INET) || defined(INET6)
3379static __inline int
3380hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3381{
3382#if __FreeBSD_version >= 1100095
3383	if (hn_lro_mbufq_depth) {
3384		tcp_lro_queue_mbuf(lc, m);
3385		return 0;
3386	}
3387#endif
3388	return tcp_lro_rx(lc, m, 0);
3389}
3390#endif
3391
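/*
 * Deliver one received packet to the network stack: copy the data into an
 * mbuf, apply RX checksum offload and VLAN tag information, set the RSS
 * hash/flowid, and hand the mbuf to LRO or if_input().  When a
 * non-transparent VF is active, the packet is injected as if it came from
 * the VF interface.
 */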
3392static int
3393hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3394    const struct hn_rxinfo *info)
3395{
3396	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3397	struct mbuf *m_new;
3398	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3399	int hash_type = M_HASHTYPE_NONE;
3400	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3401
3402	ifp = hn_ifp;
3403	if (rxr->hn_rxvf_ifp != NULL) {
3404		/*
3405		 * Non-transparent mode VF; pretend this packet is from
3406		 * the VF.
3407		 */
3408		ifp = rxr->hn_rxvf_ifp;
3409		is_vf = 1;
3410	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3411		/* Transparent mode VF. */
3412		is_vf = 1;
3413	}
3414
3415	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3416		/*
3417		 * NOTE:
3418		 * See the NOTE of hn_rndis_init_fixat().  This
3419		 * function can be reached, immediately after the
3420		 * RNDIS is initialized but before the ifnet is
3421		 * setup on the hn_attach() path; drop the unexpected
3422		 * packets.
3423		 */
3424		return (0);
3425	}
3426
3427	if (__predict_false(dlen < ETHER_HDR_LEN)) {
3428		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3429		return (0);
3430	}
3431
3432	if (dlen <= MHLEN) {
3433		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3434		if (m_new == NULL) {
3435			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3436			return (0);
3437		}
3438		memcpy(mtod(m_new, void *), data, dlen);
3439		m_new->m_pkthdr.len = m_new->m_len = dlen;
3440		rxr->hn_small_pkts++;
3441	} else {
3442		/*
3443		 * Get an mbuf with a cluster.  For packets 2K or less,
3444		 * get a standard 2K cluster.  For anything larger, get a
3445		 * 4K cluster.  Any buffers larger than 4K can cause problems
3446		 * if looped around to the Hyper-V TX channel, so avoid them.
3447		 */
3448		size = MCLBYTES;
3449		if (dlen > MCLBYTES) {
3450			/* 4096 */
3451			size = MJUMPAGESIZE;
3452		}
3453
3454		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3455		if (m_new == NULL) {
3456			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3457			return (0);
3458		}
3459
3460		hv_m_append(m_new, dlen, data);
3461	}
3462	m_new->m_pkthdr.rcvif = ifp;
3463
3464	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3465		do_csum = 0;
3466
3467	/* receive side checksum offload */
3468	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3469		/* IP csum offload */
3470		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3471			m_new->m_pkthdr.csum_flags |=
3472			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3473			rxr->hn_csum_ip++;
3474		}
3475
3476		/* TCP/UDP csum offload */
3477		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3478		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3479			m_new->m_pkthdr.csum_flags |=
3480			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3481			m_new->m_pkthdr.csum_data = 0xffff;
3482			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3483				rxr->hn_csum_tcp++;
3484			else
3485				rxr->hn_csum_udp++;
3486		}
3487
3488		/*
3489		 * XXX
3490		 * As of this writing (Oct 28th, 2016), the host side will turn
3491		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3492		 * the do_lro setting here is actually _not_ accurate.  We
3493		 * depend on the RSS hash type check to reset do_lro.
3494		 */
3495		if ((info->csum_info &
3496		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3497		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3498			do_lro = 1;
3499	} else {
3500		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3501		if (l3proto == ETHERTYPE_IP) {
3502			if (l4proto == IPPROTO_TCP) {
3503				if (do_csum &&
3504				    (rxr->hn_trust_hcsum &
3505				     HN_TRUST_HCSUM_TCP)) {
3506					rxr->hn_csum_trusted++;
3507					m_new->m_pkthdr.csum_flags |=
3508					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3509					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3510					m_new->m_pkthdr.csum_data = 0xffff;
3511				}
3512				do_lro = 1;
3513			} else if (l4proto == IPPROTO_UDP) {
3514				if (do_csum &&
3515				    (rxr->hn_trust_hcsum &
3516				     HN_TRUST_HCSUM_UDP)) {
3517					rxr->hn_csum_trusted++;
3518					m_new->m_pkthdr.csum_flags |=
3519					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3520					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3521					m_new->m_pkthdr.csum_data = 0xffff;
3522				}
3523			} else if (l4proto != IPPROTO_DONE && do_csum &&
3524			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3525				rxr->hn_csum_trusted++;
3526				m_new->m_pkthdr.csum_flags |=
3527				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3528			}
3529		}
3530	}
3531
3532	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3533		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3534		    NDIS_VLAN_INFO_ID(info->vlan_info),
3535		    NDIS_VLAN_INFO_PRI(info->vlan_info),
3536		    NDIS_VLAN_INFO_CFI(info->vlan_info));
3537		m_new->m_flags |= M_VLANTAG;
3538	}
3539
3540	/*
3541	 * If VF is activated (transparent/non-transparent mode does not
3542	 * matter here).
3543	 *
3544	 * - Disable LRO
3545	 *
3546	 *   hn(4) will only receive broadcast packets, multicast packets,
3547	 *   TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3548	 *   packet types.
3549	 *
3550	 *   For non-transparent, we definitely _cannot_ enable LRO at
3551	 *   all, since the LRO flush will use hn(4) as the receiving
3552	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3553	 */
3554	if (is_vf)
3555		do_lro = 0;
3556
3557	/*
3558	 * If VF is activated (transparent/non-transparent mode does not
3559	 * matter here), do _not_ mess with unsupported hash types or
3560	 * functions.
3561	 */
3562	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3563		rxr->hn_rss_pkts++;
3564		m_new->m_pkthdr.flowid = info->hash_value;
3565		if (!is_vf)
3566			hash_type = M_HASHTYPE_OPAQUE;
3567		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3568		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3569			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3570			    rxr->hn_mbuf_hash);
3571
3572			/*
3573			 * NOTE:
3574				 * do_lro is reset, if the hash types are not TCP
3575			 * related.  See the comment in the above csum_flags
3576			 * setup section.
3577			 */
3578			switch (type) {
3579			case NDIS_HASH_IPV4:
3580				hash_type = M_HASHTYPE_RSS_IPV4;
3581				do_lro = 0;
3582				break;
3583
3584			case NDIS_HASH_TCP_IPV4:
3585				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3586				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3587					int def_htype = M_HASHTYPE_OPAQUE;
3588
3589					if (is_vf)
3590						def_htype = M_HASHTYPE_NONE;
3591
3592					/*
3593					 * UDP 4-tuple hash is delivered as
3594					 * TCP 4-tuple hash.
3595					 */
3596					if (l3proto == ETHERTYPE_MAX) {
3597						hn_rxpkt_proto(m_new,
3598						    &l3proto, &l4proto);
3599					}
3600					if (l3proto == ETHERTYPE_IP) {
3601						if (l4proto == IPPROTO_UDP &&
3602						    (rxr->hn_mbuf_hash &
3603						     NDIS_HASH_UDP_IPV4_X)) {
3604							hash_type =
3605							M_HASHTYPE_RSS_UDP_IPV4;
3606							do_lro = 0;
3607						} else if (l4proto !=
3608						    IPPROTO_TCP) {
3609							hash_type = def_htype;
3610							do_lro = 0;
3611						}
3612					} else {
3613						hash_type = def_htype;
3614						do_lro = 0;
3615					}
3616				}
3617				break;
3618
3619			case NDIS_HASH_IPV6:
3620				hash_type = M_HASHTYPE_RSS_IPV6;
3621				do_lro = 0;
3622				break;
3623
3624			case NDIS_HASH_IPV6_EX:
3625				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3626				do_lro = 0;
3627				break;
3628
3629			case NDIS_HASH_TCP_IPV6:
3630				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3631				break;
3632
3633			case NDIS_HASH_TCP_IPV6_EX:
3634				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3635				break;
3636			}
3637		}
3638	} else if (!is_vf) {
3639		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3640	}
3641	M_HASHTYPE_SET(m_new, hash_type);
3642
3643	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3644	if (hn_ifp != ifp) {
3645		const struct ether_header *eh;
3646
3647		/*
3648		 * Non-transparent mode VF is activated.
3649		 */
3650
3651		/*
3652		 * Allow tapping on hn(4).
3653		 */
3654		ETHER_BPF_MTAP(hn_ifp, m_new);
3655
3656		/*
3657		 * Update hn(4)'s stats.
3658		 */
3659		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3660		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3661		/* Checked at the beginning of this function. */
3662		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3663		eh = mtod(m_new, struct ether_header *);
3664		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3665			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3666	}
3667	rxr->hn_pkts++;
3668
3669	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3670#if defined(INET) || defined(INET6)
3671		struct lro_ctrl *lro = &rxr->hn_lro;
3672
3673		if (lro->lro_cnt) {
3674			rxr->hn_lro_tried++;
3675			if (hn_lro_rx(lro, m_new) == 0) {
3676				/* DONE! */
3677				return 0;
3678			}
3679		}
3680#endif
3681	}
3682	ifp->if_input(ifp, m_new);
3683
3684	return (0);
3685}
3686
3687static int
3688hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3689{
3690	struct hn_softc *sc = ifp->if_softc;
3691	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3692	struct ifnet *vf_ifp;
3693	int mask, error = 0;
3694	struct ifrsskey *ifrk;
3695	struct ifrsshash *ifrh;
3696	uint32_t mtu;
3697
3698	switch (cmd) {
3699	case SIOCSIFMTU:
3700		if (ifr->ifr_mtu > HN_MTU_MAX) {
3701			error = EINVAL;
3702			break;
3703		}
3704
3705		HN_LOCK(sc);
3706
3707		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3708			HN_UNLOCK(sc);
3709			break;
3710		}
3711
3712		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3713			/* Can't change MTU */
3714			HN_UNLOCK(sc);
3715			error = EOPNOTSUPP;
3716			break;
3717		}
3718
3719		if (ifp->if_mtu == ifr->ifr_mtu) {
3720			HN_UNLOCK(sc);
3721			break;
3722		}
3723
3724		if (hn_xpnt_vf_isready(sc)) {
3725			vf_ifp = sc->hn_vf_ifp;
3726			ifr_vf = *ifr;
3727			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3728			    sizeof(ifr_vf.ifr_name));
3729			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3730			    (caddr_t)&ifr_vf);
3731			if (error) {
3732				HN_UNLOCK(sc);
3733				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3734				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3735				break;
3736			}
3737		}
3738
3739		/*
3740		 * Suspend this interface before the synthetic parts
3741		 * are ripped.
3742		 */
3743		hn_suspend(sc);
3744
3745		/*
3746		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3747		 */
3748		hn_synth_detach(sc);
3749
3750		/*
3751		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3752		 * with the new MTU setting.
3753		 */
3754		error = hn_synth_attach(sc, ifr->ifr_mtu);
3755		if (error) {
3756			HN_UNLOCK(sc);
3757			break;
3758		}
3759
3760		error = hn_rndis_get_mtu(sc, &mtu);
3761		if (error)
3762			mtu = ifr->ifr_mtu;
3763		else if (bootverbose)
3764			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3765
3766		/*
3767		 * Commit the requested MTU, after the synthetic parts
3768		 * have been successfully attached.
3769		 */
3770		if (mtu >= ifr->ifr_mtu) {
3771			mtu = ifr->ifr_mtu;
3772		} else {
3773			if_printf(ifp, "fixup mtu %d -> %u\n",
3774			    ifr->ifr_mtu, mtu);
3775		}
3776		ifp->if_mtu = mtu;
3777
3778		/*
3779		 * Synthetic parts' reattach may change the chimney
3780		 * sending size; update it.
3781		 */
3782		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3783			hn_set_chim_size(sc, sc->hn_chim_szmax);
3784
3785		/*
3786		 * Make sure that various parameters based on MTU are
3787		 * still valid, after the MTU change.
3788		 */
3789		hn_mtu_change_fixup(sc);
3790
3791		/*
3792		 * All done!  Resume the interface now.
3793		 */
3794		hn_resume(sc);
3795
3796		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3797		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3798			/*
3799			 * Since we have reattached the NVS part,
3800			 * switch the datapath back to the VF, in case
3801			 * the setting was lost when the NVS was detached.
3802			 */
3803			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3804		}
3805
3806		HN_UNLOCK(sc);
3807		break;
3808
3809	case SIOCSIFFLAGS:
3810		HN_LOCK(sc);
3811
3812		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3813			HN_UNLOCK(sc);
3814			break;
3815		}
3816
3817		if (hn_xpnt_vf_isready(sc))
3818			hn_xpnt_vf_saveifflags(sc);
3819
3820		if (ifp->if_flags & IFF_UP) {
3821			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3822				/*
3823				 * Caller might hold a mutex, e.g.
3824				 * bpf; use busy-wait for the RNDIS
3825				 * reply.
3826				 */
3827				HN_NO_SLEEPING(sc);
3828				hn_rxfilter_config(sc);
3829				HN_SLEEPING_OK(sc);
3830
3831				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3832					error = hn_xpnt_vf_iocsetflags(sc);
3833			} else {
3834				hn_init_locked(sc);
3835			}
3836		} else {
3837			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3838				hn_stop(sc, false);
3839		}
3840		sc->hn_if_flags = ifp->if_flags;
3841
3842		HN_UNLOCK(sc);
3843		break;
3844
3845	case SIOCSIFCAP:
3846		HN_LOCK(sc);
3847
3848		if (hn_xpnt_vf_isready(sc)) {
3849			ifr_vf = *ifr;
3850			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3851			    sizeof(ifr_vf.ifr_name));
3852			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3853			HN_UNLOCK(sc);
3854			break;
3855		}
3856
3857		/*
3858		 * Fix up requested capabilities w/ supported capabilities,
3859		 * since the supported capabilities could have been changed.
3860		 */
3861		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3862		    ifp->if_capenable;
3863
3864		if (mask & IFCAP_TXCSUM) {
3865			ifp->if_capenable ^= IFCAP_TXCSUM;
3866			if (ifp->if_capenable & IFCAP_TXCSUM)
3867				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3868			else
3869				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3870		}
3871		if (mask & IFCAP_TXCSUM_IPV6) {
3872			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3873			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3874				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3875			else
3876				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3877		}
3878
3879		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3880		if (mask & IFCAP_RXCSUM)
3881			ifp->if_capenable ^= IFCAP_RXCSUM;
3882#ifdef foo
3883		/* We can't distinguish IPv6 from IPv4 packets on the RX path. */
3884		if (mask & IFCAP_RXCSUM_IPV6)
3885			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3886#endif
3887
3888		if (mask & IFCAP_LRO)
3889			ifp->if_capenable ^= IFCAP_LRO;
3890
3891		if (mask & IFCAP_TSO4) {
3892			ifp->if_capenable ^= IFCAP_TSO4;
3893			if (ifp->if_capenable & IFCAP_TSO4)
3894				ifp->if_hwassist |= CSUM_IP_TSO;
3895			else
3896				ifp->if_hwassist &= ~CSUM_IP_TSO;
3897		}
3898		if (mask & IFCAP_TSO6) {
3899			ifp->if_capenable ^= IFCAP_TSO6;
3900			if (ifp->if_capenable & IFCAP_TSO6)
3901				ifp->if_hwassist |= CSUM_IP6_TSO;
3902			else
3903				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3904		}
3905
3906		HN_UNLOCK(sc);
3907		break;
3908
3909	case SIOCADDMULTI:
3910	case SIOCDELMULTI:
3911		HN_LOCK(sc);
3912
3913		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3914			HN_UNLOCK(sc);
3915			break;
3916		}
3917		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3918			/*
3919			 * Multicast updates hold a mutex; use busy-wait for
3920			 * the RNDIS reply.
3921			 */
3922			HN_NO_SLEEPING(sc);
3923			hn_rxfilter_config(sc);
3924			HN_SLEEPING_OK(sc);
3925		}
3926
3927		/* XXX vlan(4) style mcast addr maintenance */
3928		if (hn_xpnt_vf_isready(sc)) {
3929			int old_if_flags;
3930
3931			old_if_flags = sc->hn_vf_ifp->if_flags;
3932			hn_xpnt_vf_saveifflags(sc);
3933
3934			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3935			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3936			     IFF_ALLMULTI))
3937				error = hn_xpnt_vf_iocsetflags(sc);
3938		}
3939
3940		HN_UNLOCK(sc);
3941		break;
3942
3943	case SIOCSIFMEDIA:
3944	case SIOCGIFMEDIA:
3945		HN_LOCK(sc);
3946		if (hn_xpnt_vf_isready(sc)) {
3947			/*
3948			 * SIOCGIFMEDIA expects ifmediareq, so don't
3949			 * create and pass ifr_vf to the VF here; just
3950			 * replace the ifr_name.
3951			 */
3952			vf_ifp = sc->hn_vf_ifp;
3953			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3954			    sizeof(ifr->ifr_name));
3955			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3956			/* Restore the ifr_name. */
3957			strlcpy(ifr->ifr_name, ifp->if_xname,
3958			    sizeof(ifr->ifr_name));
3959			HN_UNLOCK(sc);
3960			break;
3961		}
3962		HN_UNLOCK(sc);
3963		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3964		break;
3965
3966	case SIOCGIFRSSHASH:
3967		ifrh = (struct ifrsshash *)data;
3968		HN_LOCK(sc);
3969		if (sc->hn_rx_ring_inuse == 1) {
3970			HN_UNLOCK(sc);
3971			ifrh->ifrh_func = RSS_FUNC_NONE;
3972			ifrh->ifrh_types = 0;
3973			break;
3974		}
3975
3976		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3977			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3978		else
3979			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3980		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3981		HN_UNLOCK(sc);
3982		break;
3983
3984	case SIOCGIFRSSKEY:
3985		ifrk = (struct ifrsskey *)data;
3986		HN_LOCK(sc);
3987		if (sc->hn_rx_ring_inuse == 1) {
3988			HN_UNLOCK(sc);
3989			ifrk->ifrk_func = RSS_FUNC_NONE;
3990			ifrk->ifrk_keylen = 0;
3991			break;
3992		}
3993		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3994			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3995		else
3996			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3997		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3998		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3999		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4000		HN_UNLOCK(sc);
4001		break;
4002
4003	default:
4004		error = ether_ioctl(ifp, cmd, data);
4005		break;
4006	}
4007	return (error);
4008}
4009
4010static void
4011hn_stop(struct hn_softc *sc, bool detaching)
4012{
4013	struct ifnet *ifp = sc->hn_ifp;
4014	int i;
4015
4016	HN_LOCK_ASSERT(sc);
4017
4018	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4019	    ("synthetic parts were not attached"));
4020
4021	/* Clear RUNNING bit ASAP. */
4022	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4023
4024	/* Disable polling. */
4025	hn_polling(sc, 0);
4026
4027	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4028		KASSERT(sc->hn_vf_ifp != NULL,
4029		    ("%s: VF is not attached", ifp->if_xname));
4030
4031		/* Mark transparent mode VF as disabled. */
4032		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4033
4034		/*
4035		 * NOTE:
4036		 * Datapath setting must happen _before_ bringing
4037		 * the VF down.
4038		 */
4039		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4040
4041		/*
4042		 * Bring the VF down.
4043		 */
4044		hn_xpnt_vf_saveifflags(sc);
4045		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4046		hn_xpnt_vf_iocsetflags(sc);
4047	}
4048
4049	/* Suspend data transfers. */
4050	hn_suspend_data(sc);
4051
4052	/* Clear OACTIVE bit. */
4053	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4054	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4055		sc->hn_tx_ring[i].hn_oactive = 0;
4056
4057	/*
4058	 * If the non-transparent mode VF is active, make sure
4059	 * that the RX filter still allows packet reception.
4060	 */
4061	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4062		hn_rxfilter_config(sc);
4063}
4064
4065static void
4066hn_init_locked(struct hn_softc *sc)
4067{
4068	struct ifnet *ifp = sc->hn_ifp;
4069	int i;
4070
4071	HN_LOCK_ASSERT(sc);
4072
4073	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4074		return;
4075
4076	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4077		return;
4078
4079	/* Configure RX filter */
4080	hn_rxfilter_config(sc);
4081
4082	/* Clear OACTIVE bit. */
4083	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4084	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4085		sc->hn_tx_ring[i].hn_oactive = 0;
4086
4087	/* Clear TX 'suspended' bit. */
4088	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4089
4090	if (hn_xpnt_vf_isready(sc)) {
4091		/* Initialize transparent VF. */
4092		hn_xpnt_vf_init(sc);
4093	}
4094
4095	/* Everything is ready; unleash! */
4096	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4097
4098	/* Re-enable polling if requested. */
4099	if (sc->hn_pollhz > 0)
4100		hn_polling(sc, sc->hn_pollhz);
4101}
4102
4103static void
4104hn_init(void *xsc)
4105{
4106	struct hn_softc *sc = xsc;
4107
4108	HN_LOCK(sc);
4109	hn_init_locked(sc);
4110	HN_UNLOCK(sc);
4111}
4112
4113#if __FreeBSD_version >= 1100099
4114
4115static int
4116hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4117{
4118	struct hn_softc *sc = arg1;
4119	unsigned int lenlim;
4120	int error;
4121
4122	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4123	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4124	if (error || req->newptr == NULL)
4125		return error;
4126
4127	HN_LOCK(sc);
4128	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4129	    lenlim > TCP_LRO_LENGTH_MAX) {
4130		HN_UNLOCK(sc);
4131		return EINVAL;
4132	}
4133	hn_set_lro_lenlim(sc, lenlim);
4134	HN_UNLOCK(sc);
4135
4136	return 0;
4137}
4138
4139static int
4140hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4141{
4142	struct hn_softc *sc = arg1;
4143	int ackcnt, error, i;
4144
4145	/*
4146	 * lro_ackcnt_lim is the append count limit; add 1 to turn it into
4147	 * the aggregation limit exposed through this sysctl.
4148	 */
4149	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4150	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4151	if (error || req->newptr == NULL)
4152		return error;
4153
4154	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4155		return EINVAL;
4156
4157	/*
4158	 * Convert aggregation limit back to append
4159	 * count limit.
4160	 */
4161	--ackcnt;
4162	HN_LOCK(sc);
4163	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4164		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4165	HN_UNLOCK(sc);
4166	return 0;
4167}
4168
4169#endif
4170
4171static int
4172hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4173{
4174	struct hn_softc *sc = arg1;
4175	int hcsum = arg2;
4176	int on, error, i;
4177
4178	on = 0;
4179	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4180		on = 1;
4181
4182	error = sysctl_handle_int(oidp, &on, 0, req);
4183	if (error || req->newptr == NULL)
4184		return error;
4185
4186	HN_LOCK(sc);
4187	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4188		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4189
4190		if (on)
4191			rxr->hn_trust_hcsum |= hcsum;
4192		else
4193			rxr->hn_trust_hcsum &= ~hcsum;
4194	}
4195	HN_UNLOCK(sc);
4196	return 0;
4197}
4198
4199static int
4200hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4201{
4202	struct hn_softc *sc = arg1;
4203	int chim_size, error;
4204
4205	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4206	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4207	if (error || req->newptr == NULL)
4208		return error;
4209
4210	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4211		return EINVAL;
4212
4213	HN_LOCK(sc);
4214	hn_set_chim_size(sc, chim_size);
4215	HN_UNLOCK(sc);
4216	return 0;
4217}
4218
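/*
 * The statistics sysctl handlers below report the sum of the field at
 * byte offset arg2 over the RX/TX rings; writing any value to such a
 * node zeroes that field in every ring.
 */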
4219#if __FreeBSD_version < 1100095
4220static int
4221hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4222{
4223	struct hn_softc *sc = arg1;
4224	int ofs = arg2, i, error;
4225	struct hn_rx_ring *rxr;
4226	uint64_t stat;
4227
4228	stat = 0;
4229	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4230		rxr = &sc->hn_rx_ring[i];
4231		stat += *((int *)((uint8_t *)rxr + ofs));
4232	}
4233
4234	error = sysctl_handle_64(oidp, &stat, 0, req);
4235	if (error || req->newptr == NULL)
4236		return error;
4237
4238	/* Zero out this stat. */
4239	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4240		rxr = &sc->hn_rx_ring[i];
4241		*((int *)((uint8_t *)rxr + ofs)) = 0;
4242	}
4243	return 0;
4244}
4245#else
4246static int
4247hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4248{
4249	struct hn_softc *sc = arg1;
4250	int ofs = arg2, i, error;
4251	struct hn_rx_ring *rxr;
4252	uint64_t stat;
4253
4254	stat = 0;
4255	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4256		rxr = &sc->hn_rx_ring[i];
4257		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4258	}
4259
4260	error = sysctl_handle_64(oidp, &stat, 0, req);
4261	if (error || req->newptr == NULL)
4262		return error;
4263
4264	/* Zero out this stat. */
4265	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4266		rxr = &sc->hn_rx_ring[i];
4267		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4268	}
4269	return 0;
4270}
4271
4272#endif
4273
4274static int
4275hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4276{
4277	struct hn_softc *sc = arg1;
4278	int ofs = arg2, i, error;
4279	struct hn_rx_ring *rxr;
4280	u_long stat;
4281
4282	stat = 0;
4283	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4284		rxr = &sc->hn_rx_ring[i];
4285		stat += *((u_long *)((uint8_t *)rxr + ofs));
4286	}
4287
4288	error = sysctl_handle_long(oidp, &stat, 0, req);
4289	if (error || req->newptr == NULL)
4290		return error;
4291
4292	/* Zero out this stat. */
4293	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4294		rxr = &sc->hn_rx_ring[i];
4295		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4296	}
4297	return 0;
4298}
4299
4300static int
4301hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4302{
4303	struct hn_softc *sc = arg1;
4304	int ofs = arg2, i, error;
4305	struct hn_tx_ring *txr;
4306	u_long stat;
4307
4308	stat = 0;
4309	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4310		txr = &sc->hn_tx_ring[i];
4311		stat += *((u_long *)((uint8_t *)txr + ofs));
4312	}
4313
4314	error = sysctl_handle_long(oidp, &stat, 0, req);
4315	if (error || req->newptr == NULL)
4316		return error;
4317
4318	/* Zero out this stat. */
4319	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4320		txr = &sc->hn_tx_ring[i];
4321		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4322	}
4323	return 0;
4324}
4325
4326static int
4327hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4328{
4329	struct hn_softc *sc = arg1;
4330	int ofs = arg2, i, error, conf;
4331	struct hn_tx_ring *txr;
4332
4333	txr = &sc->hn_tx_ring[0];
4334	conf = *((int *)((uint8_t *)txr + ofs));
4335
4336	error = sysctl_handle_int(oidp, &conf, 0, req);
4337	if (error || req->newptr == NULL)
4338		return error;
4339
4340	HN_LOCK(sc);
4341	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4342		txr = &sc->hn_tx_ring[i];
4343		*((int *)((uint8_t *)txr + ofs)) = conf;
4344	}
4345	HN_UNLOCK(sc);
4346
4347	return 0;
4348}
4349
4350static int
4351hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4352{
4353	struct hn_softc *sc = arg1;
4354	int error, size;
4355
4356	size = sc->hn_agg_size;
4357	error = sysctl_handle_int(oidp, &size, 0, req);
4358	if (error || req->newptr == NULL)
4359		return (error);
4360
4361	HN_LOCK(sc);
4362	sc->hn_agg_size = size;
4363	hn_set_txagg(sc);
4364	HN_UNLOCK(sc);
4365
4366	return (0);
4367}
4368
4369static int
4370hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4371{
4372	struct hn_softc *sc = arg1;
4373	int error, pkts;
4374
4375	pkts = sc->hn_agg_pkts;
4376	error = sysctl_handle_int(oidp, &pkts, 0, req);
4377	if (error || req->newptr == NULL)
4378		return (error);
4379
4380	HN_LOCK(sc);
4381	sc->hn_agg_pkts = pkts;
4382	hn_set_txagg(sc);
4383	HN_UNLOCK(sc);
4384
4385	return (0);
4386}
4387
4388static int
4389hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4390{
4391	struct hn_softc *sc = arg1;
4392	int pkts;
4393
4394	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4395	return (sysctl_handle_int(oidp, &pkts, 0, req));
4396}
4397
4398static int
4399hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4400{
4401	struct hn_softc *sc = arg1;
4402	int align;
4403
4404	align = sc->hn_tx_ring[0].hn_agg_align;
4405	return (sysctl_handle_int(oidp, &align, 0, req));
4406}
4407
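/*
 * Switch a single channel between interrupt-driven and polling mode;
 * pollhz == 0 disables polling for the channel.
 */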
4408static void
4409hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4410{
4411	if (pollhz == 0)
4412		vmbus_chan_poll_disable(chan);
4413	else
4414		vmbus_chan_poll_enable(chan, pollhz);
4415}
4416
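/*
 * Apply the polling frequency to the primary channel and all in-use
 * subchannels.
 */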
4417static void
4418hn_polling(struct hn_softc *sc, u_int pollhz)
4419{
4420	int nsubch = sc->hn_rx_ring_inuse - 1;
4421
4422	HN_LOCK_ASSERT(sc);
4423
4424	if (nsubch > 0) {
4425		struct vmbus_channel **subch;
4426		int i;
4427
4428		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4429		for (i = 0; i < nsubch; ++i)
4430			hn_chan_polling(subch[i], pollhz);
4431		vmbus_subchan_rel(subch, nsubch);
4432	}
4433	hn_chan_polling(sc->hn_prichan, pollhz);
4434}
4435
4436static int
4437hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4438{
4439	struct hn_softc *sc = arg1;
4440	int pollhz, error;
4441
4442	pollhz = sc->hn_pollhz;
4443	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4444	if (error || req->newptr == NULL)
4445		return (error);
4446
4447	if (pollhz != 0 &&
4448	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4449		return (EINVAL);
4450
4451	HN_LOCK(sc);
4452	if (sc->hn_pollhz != pollhz) {
4453		sc->hn_pollhz = pollhz;
4454		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4455		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4456			hn_polling(sc, sc->hn_pollhz);
4457	}
4458	HN_UNLOCK(sc);
4459
4460	return (0);
4461}
4462
4463static int
4464hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4465{
4466	struct hn_softc *sc = arg1;
4467	char verstr[16];
4468
4469	snprintf(verstr, sizeof(verstr), "%u.%u",
4470	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4471	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4472	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4473}
4474
4475static int
4476hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4477{
4478	struct hn_softc *sc = arg1;
4479	char caps_str[128];
4480	uint32_t caps;
4481
4482	HN_LOCK(sc);
4483	caps = sc->hn_caps;
4484	HN_UNLOCK(sc);
4485	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4486	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4487}
4488
4489static int
4490hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4491{
4492	struct hn_softc *sc = arg1;
4493	char assist_str[128];
4494	uint32_t hwassist;
4495
4496	HN_LOCK(sc);
4497	hwassist = sc->hn_ifp->if_hwassist;
4498	HN_UNLOCK(sc);
4499	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4500	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4501}
4502
4503static int
4504hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4505{
4506	struct hn_softc *sc = arg1;
4507	char filter_str[128];
4508	uint32_t filter;
4509
4510	HN_LOCK(sc);
4511	filter = sc->hn_rx_filter;
4512	HN_UNLOCK(sc);
4513	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4514	    NDIS_PACKET_TYPES);
4515	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4516}
4517
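/*
 * Read returns the current RSS key.  A write installs a new key and,
 * when more than one RX ring is in use, reconfigures RSS; writes are
 * rejected (EBUSY) while a VF is in use.
 */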
4518static int
4519hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4520{
4521	struct hn_softc *sc = arg1;
4522	int error;
4523
4524	HN_LOCK(sc);
4525
4526	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4527	if (error || req->newptr == NULL)
4528		goto back;
4529
4530	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4531	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4532		/*
4533		 * The RSS key is synchronized with the VF's; don't allow users
4534		 * to change it.
4535		 */
4536		error = EBUSY;
4537		goto back;
4538	}
4539
4540	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4541	if (error)
4542		goto back;
4543	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4544
4545	if (sc->hn_rx_ring_inuse > 1) {
4546		error = hn_rss_reconfig(sc);
4547	} else {
4548		/* Not RSS capable, at least for now; just save the RSS key. */
4549		error = 0;
4550	}
4551back:
4552	HN_UNLOCK(sc);
4553	return (error);
4554}
4555
4556static int
4557hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4558{
4559	struct hn_softc *sc = arg1;
4560	int error;
4561
4562	HN_LOCK(sc);
4563
4564	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4565	if (error || req->newptr == NULL)
4566		goto back;
4567
4568	/*
4569	 * Don't allow the RSS indirect table to be changed if this
4570	 * interface is not currently RSS capable.
4571	 */
4572	if (sc->hn_rx_ring_inuse == 1) {
4573		error = EOPNOTSUPP;
4574		goto back;
4575	}
4576
4577	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4578	if (error)
4579		goto back;
4580	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4581
4582	hn_rss_ind_fixup(sc);
4583	error = hn_rss_reconfig(sc);
4584back:
4585	HN_UNLOCK(sc);
4586	return (error);
4587}
4588
4589static int
4590hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4591{
4592	struct hn_softc *sc = arg1;
4593	char hash_str[128];
4594	uint32_t hash;
4595
4596	HN_LOCK(sc);
4597	hash = sc->hn_rss_hash;
4598	HN_UNLOCK(sc);
4599	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4600	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4601}
4602
4603static int
4604hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4605{
4606	struct hn_softc *sc = arg1;
4607	char hash_str[128];
4608	uint32_t hash;
4609
4610	HN_LOCK(sc);
4611	hash = sc->hn_rss_hcap;
4612	HN_UNLOCK(sc);
4613	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4614	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4615}
4616
4617static int
4618hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4619{
4620	struct hn_softc *sc = arg1;
4621	char hash_str[128];
4622	uint32_t hash;
4623
4624	HN_LOCK(sc);
4625	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4626	HN_UNLOCK(sc);
4627	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4628	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4629}
4630
4631static int
4632hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4633{
4634	struct hn_softc *sc = arg1;
4635	char vf_name[IFNAMSIZ + 1];
4636	struct ifnet *vf_ifp;
4637
4638	HN_LOCK(sc);
4639	vf_name[0] = '\0';
4640	vf_ifp = sc->hn_vf_ifp;
4641	if (vf_ifp != NULL)
4642		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4643	HN_UNLOCK(sc);
4644	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4645}
4646
4647static int
4648hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4649{
4650	struct hn_softc *sc = arg1;
4651	char vf_name[IFNAMSIZ + 1];
4652	struct ifnet *vf_ifp;
4653
4654	HN_LOCK(sc);
4655	vf_name[0] = '\0';
4656	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4657	if (vf_ifp != NULL)
4658		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4659	HN_UNLOCK(sc);
4660	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4661}
4662
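/*
 * Report the names of all VF interfaces currently registered with
 * hn(4), separated by spaces.
 */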
4663static int
4664hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4665{
4666	struct rm_priotracker pt;
4667	struct sbuf *sb;
4668	int error, i;
4669	bool first;
4670
4671	error = sysctl_wire_old_buffer(req, 0);
4672	if (error != 0)
4673		return (error);
4674
4675	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4676	if (sb == NULL)
4677		return (ENOMEM);
4678
4679	rm_rlock(&hn_vfmap_lock, &pt);
4680
4681	first = true;
4682	for (i = 0; i < hn_vfmap_size; ++i) {
4683		struct ifnet *ifp;
4684
4685		if (hn_vfmap[i] == NULL)
4686			continue;
4687
4688		ifp = ifnet_byindex(i);
4689		if (ifp != NULL) {
4690			if (first)
4691				sbuf_printf(sb, "%s", ifp->if_xname);
4692			else
4693				sbuf_printf(sb, " %s", ifp->if_xname);
4694			first = false;
4695		}
4696	}
4697
4698	rm_runlock(&hn_vfmap_lock, &pt);
4699
4700	error = sbuf_finish(sb);
4701	sbuf_delete(sb);
4702	return (error);
4703}
4704
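/*
 * Report the VF-to-hn(4) pairings as a space separated list of
 * "VF:hn" name pairs.
 */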
4705static int
4706hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4707{
4708	struct rm_priotracker pt;
4709	struct sbuf *sb;
4710	int error, i;
4711	bool first;
4712
4713	error = sysctl_wire_old_buffer(req, 0);
4714	if (error != 0)
4715		return (error);
4716
4717	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4718	if (sb == NULL)
4719		return (ENOMEM);
4720
4721	rm_rlock(&hn_vfmap_lock, &pt);
4722
4723	first = true;
4724	for (i = 0; i < hn_vfmap_size; ++i) {
4725		struct ifnet *ifp, *hn_ifp;
4726
4727		hn_ifp = hn_vfmap[i];
4728		if (hn_ifp == NULL)
4729			continue;
4730
4731		ifp = ifnet_byindex(i);
4732		if (ifp != NULL) {
4733			if (first) {
4734				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4735				    hn_ifp->if_xname);
4736			} else {
4737				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4738				    hn_ifp->if_xname);
4739			}
4740			first = false;
4741		}
4742	}
4743
4744	rm_runlock(&hn_vfmap_lock, &pt);
4745
4746	error = sbuf_finish(sb);
4747	sbuf_delete(sb);
4748	return (error);
4749}
4750
4751static int
4752hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4753{
4754	struct hn_softc *sc = arg1;
4755	int error, onoff = 0;
4756
4757	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4758		onoff = 1;
4759	error = sysctl_handle_int(oidp, &onoff, 0, req);
4760	if (error || req->newptr == NULL)
4761		return (error);
4762
4763	HN_LOCK(sc);
4764	/* NOTE: hn_vf_lock for hn_transmit() */
4765	rm_wlock(&sc->hn_vf_lock);
4766	if (onoff)
4767		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4768	else
4769		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4770	rm_wunlock(&sc->hn_vf_lock);
4771	HN_UNLOCK(sc);
4772
4773	return (0);
4774}
4775
4776static int
4777hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4778{
4779	struct hn_softc *sc = arg1;
4780	int enabled = 0;
4781
4782	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4783		enabled = 1;
4784	return (sysctl_handle_int(oidp, &enabled, 0, req));
4785}
4786
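/*
 * Sanity check an IPv4 packet starting at byte offset hoff in the mbuf:
 * the full IP header (and the TCP/UDP header, when present) must lie in
 * the first mbuf, the lengths must be consistent, and fragments are
 * ignored.  Returns the IP protocol number on success, or IPPROTO_DONE
 * otherwise.
 */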
4787static int
4788hn_check_iplen(const struct mbuf *m, int hoff)
4789{
4790	const struct ip *ip;
4791	int len, iphlen, iplen;
4792	const struct tcphdr *th;
4793	int thoff;				/* TCP data offset */
4794
4795	len = hoff + sizeof(struct ip);
4796
4797	/* The packet must be at least the size of an IP header. */
4798	if (m->m_pkthdr.len < len)
4799		return IPPROTO_DONE;
4800
4801	/* The fixed IP header must reside completely in the first mbuf. */
4802	if (m->m_len < len)
4803		return IPPROTO_DONE;
4804
4805	ip = mtodo(m, hoff);
4806
4807	/* Bound check the packet's stated IP header length. */
4808	iphlen = ip->ip_hl << 2;
4809	if (iphlen < sizeof(struct ip))		/* minimum header length */
4810		return IPPROTO_DONE;
4811
4812	/* The full IP header must reside completely in the one mbuf. */
4813	if (m->m_len < hoff + iphlen)
4814		return IPPROTO_DONE;
4815
4816	iplen = ntohs(ip->ip_len);
4817
4818	/*
4819	 * Check that the amount of data in the buffers is at least
4820	 * as much as the IP header would have us expect.
4821	 */
4822	if (m->m_pkthdr.len < hoff + iplen)
4823		return IPPROTO_DONE;
4824
4825	/*
4826	 * Ignore IP fragments.
4827	 */
4828	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4829		return IPPROTO_DONE;
4830
4831	/*
4832	 * The TCP/IP or UDP/IP header must be entirely contained within
4833	 * the first fragment of a packet.
4834	 */
4835	switch (ip->ip_p) {
4836	case IPPROTO_TCP:
4837		if (iplen < iphlen + sizeof(struct tcphdr))
4838			return IPPROTO_DONE;
4839		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4840			return IPPROTO_DONE;
4841		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4842		thoff = th->th_off << 2;
4843		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4844			return IPPROTO_DONE;
4845		if (m->m_len < hoff + iphlen + thoff)
4846			return IPPROTO_DONE;
4847		break;
4848	case IPPROTO_UDP:
4849		if (iplen < iphlen + sizeof(struct udphdr))
4850			return IPPROTO_DONE;
4851		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4852			return IPPROTO_DONE;
4853		break;
4854	default:
4855		if (iplen < iphlen)
4856			return IPPROTO_DONE;
4857		break;
4858	}
4859	return ip->ip_p;
4860}
4861
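/*
 * Best-effort L3/L4 protocol detection for a received frame: *l3proto is
 * set to the ethertype and, for IPv4, *l4proto to the result of
 * hn_check_iplen(); otherwise *l4proto is set to IPPROTO_DONE.  A
 * truncated VLAN header leaves both outputs untouched.
 */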
4862static void
4863hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4864{
4865	const struct ether_header *eh;
4866	uint16_t etype;
4867	int hoff;
4868
4869	hoff = sizeof(*eh);
4870	/* Checked at the beginning of this function. */
4871	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4872
4873	eh = mtod(m_new, const struct ether_header *);
4874	etype = ntohs(eh->ether_type);
4875	if (etype == ETHERTYPE_VLAN) {
4876		const struct ether_vlan_header *evl;
4877
4878		hoff = sizeof(*evl);
4879		if (m_new->m_len < hoff)
4880			return;
4881		evl = mtod(m_new, const struct ether_vlan_header *);
4882		etype = ntohs(evl->evl_proto);
4883	}
4884	*l3proto = etype;
4885
4886	if (etype == ETHERTYPE_IP)
4887		*l4proto = hn_check_iplen(m_new, hoff);
4888	else
4889		*l4proto = IPPROTO_DONE;
4890}
4891
4892static int
4893hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4894{
4895	struct sysctl_oid_list *child;
4896	struct sysctl_ctx_list *ctx;
4897	device_t dev = sc->hn_dev;
4898#if defined(INET) || defined(INET6)
4899#if __FreeBSD_version >= 1100095
4900	int lroent_cnt;
4901#endif
4902#endif
4903	int i;
4904
4905	/*
4906	 * Create RXBUF for reception.
4907	 *
4908	 * NOTE:
4909	 * - It is shared by all channels.
4910	 * - A large enough buffer is allocated; certain versions of the NVS
4911	 *   may further limit the usable space.
4912	 */
4913	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4914	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4915	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4916	if (sc->hn_rxbuf == NULL) {
4917		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4918		return (ENOMEM);
4919	}
4920
4921	sc->hn_rx_ring_cnt = ring_cnt;
4922	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4923
4924	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4925	    M_DEVBUF, M_WAITOK | M_ZERO);
4926
4927#if defined(INET) || defined(INET6)
4928#if __FreeBSD_version >= 1100095
4929	lroent_cnt = hn_lro_entry_count;
4930	if (lroent_cnt < TCP_LRO_ENTRIES)
4931		lroent_cnt = TCP_LRO_ENTRIES;
4932	if (bootverbose)
4933		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4934#endif
4935#endif	/* INET || INET6 */
4936
4937	ctx = device_get_sysctl_ctx(dev);
4938	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4939
4940	/* Create dev.hn.UNIT.rx sysctl tree */
4941	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4942	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4943
4944	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4945		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4946
4947		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4948		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4949		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4950		if (rxr->hn_br == NULL) {
4951			device_printf(dev, "allocate bufring failed\n");
4952			return (ENOMEM);
4953		}
4954
4955		if (hn_trust_hosttcp)
4956			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4957		if (hn_trust_hostudp)
4958			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4959		if (hn_trust_hostip)
4960			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4961		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4962		rxr->hn_ifp = sc->hn_ifp;
4963		if (i < sc->hn_tx_ring_cnt)
4964			rxr->hn_txr = &sc->hn_tx_ring[i];
4965		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4966		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4967		rxr->hn_rx_idx = i;
4968		rxr->hn_rxbuf = sc->hn_rxbuf;
4969
4970		/*
4971		 * Initialize LRO.
4972		 */
4973#if defined(INET) || defined(INET6)
4974#if __FreeBSD_version >= 1100095
4975		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4976		    hn_lro_mbufq_depth);
4977#else
4978		tcp_lro_init(&rxr->hn_lro);
4979		rxr->hn_lro.ifp = sc->hn_ifp;
4980#endif
4981#if __FreeBSD_version >= 1100099
4982		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4983		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4984#endif
4985#endif	/* INET || INET6 */
4986
4987		if (sc->hn_rx_sysctl_tree != NULL) {
4988			char name[16];
4989
4990			/*
4991			 * Create per RX ring sysctl tree:
4992			 * dev.hn.UNIT.rx.RINGID
4993			 */
4994			snprintf(name, sizeof(name), "%d", i);
4995			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4996			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4997			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4998
4999			if (rxr->hn_rx_sysctl_tree != NULL) {
5000				SYSCTL_ADD_ULONG(ctx,
5001				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5002				    OID_AUTO, "packets", CTLFLAG_RW,
5003				    &rxr->hn_pkts, "# of packets received");
5004				SYSCTL_ADD_ULONG(ctx,
5005				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5006				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5007				    &rxr->hn_rss_pkts,
5008				    "# of packets w/ RSS info received");
5009				SYSCTL_ADD_INT(ctx,
5010				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5011				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5012				    &rxr->hn_pktbuf_len, 0,
5013				    "Temporary channel packet buffer length");
5014			}
5015		}
5016	}
5017
5018	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5019	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5020	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5021#if __FreeBSD_version < 1100095
5022	    hn_rx_stat_int_sysctl,
5023#else
5024	    hn_rx_stat_u64_sysctl,
5025#endif
5026	    "LU", "LRO queued");
5027	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5028	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5029	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5030#if __FreeBSD_version < 1100095
5031	    hn_rx_stat_int_sysctl,
5032#else
5033	    hn_rx_stat_u64_sysctl,
5034#endif
5035	    "LU", "LRO flushed");
5036	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5037	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5038	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5039	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5040#if __FreeBSD_version >= 1100099
5041	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5042	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5043	    hn_lro_lenlim_sysctl, "IU",
5044	    "Max # of data bytes to be aggregated by LRO");
5045	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5046	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5047	    hn_lro_ackcnt_sysctl, "I",
5048	    "Max # of ACKs to be aggregated by LRO");
5049#endif
5050	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5051	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5052	    hn_trust_hcsum_sysctl, "I",
5053	    "Trust tcp segment verification on host side, "
5054	    "when csum info is missing");
5055	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5056	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5057	    hn_trust_hcsum_sysctl, "I",
5058	    "Trust udp datagram verification on host side, "
5059	    "when csum info is missing");
5060	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5061	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5062	    hn_trust_hcsum_sysctl, "I",
5063	    "Trust ip packet verification on host side, "
5064	    "when csum info is missing");
5065	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5066	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5067	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5068	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5069	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5070	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5071	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5072	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5073	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5074	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5075	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5076	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5077	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5078	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5079	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5080	    hn_rx_stat_ulong_sysctl, "LU",
5081	    "# of packets that we trust host's csum verification");
5082	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5083	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5084	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5085	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5086	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5087	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5088	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5089	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5090	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5091	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5092	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5093	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5094
5095	return (0);
5096}
5097
5098static void
5099hn_destroy_rx_data(struct hn_softc *sc)
5100{
5101	int i;
5102
5103	if (sc->hn_rxbuf != NULL) {
5104		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5105			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5106		else
5107			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5108		sc->hn_rxbuf = NULL;
5109	}
5110
5111	if (sc->hn_rx_ring_cnt == 0)
5112		return;
5113
5114	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5115		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5116
5117		if (rxr->hn_br == NULL)
5118			continue;
5119		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5120			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5121		} else {
5122			device_printf(sc->hn_dev,
5123			    "%dth channel bufring is referenced\n", i);
5124		}
5125		rxr->hn_br = NULL;
5126
5127#if defined(INET) || defined(INET6)
5128		tcp_lro_free(&rxr->hn_lro);
5129#endif
5130		free(rxr->hn_pktbuf, M_DEVBUF);
5131	}
5132	free(sc->hn_rx_ring, M_DEVBUF);
5133	sc->hn_rx_ring = NULL;
5134
5135	sc->hn_rx_ring_cnt = 0;
5136	sc->hn_rx_ring_inuse = 0;
5137}
5138
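/*
 * Set up one TX ring: allocate the txdesc array, pick the TX taskqueue,
 * create the DMA tags for RNDIS packet messages and packet data,
 * pre-allocate and map an RNDIS message buffer per descriptor, and
 * attach the per-ring sysctl nodes.
 */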
5139static int
5140hn_tx_ring_create(struct hn_softc *sc, int id)
5141{
5142	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5143	device_t dev = sc->hn_dev;
5144	bus_dma_tag_t parent_dtag;
5145	int error, i;
5146
5147	txr->hn_sc = sc;
5148	txr->hn_tx_idx = id;
5149
5150#ifndef HN_USE_TXDESC_BUFRING
5151	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5152#endif
5153	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5154
5155	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5156	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5157	    M_DEVBUF, M_WAITOK | M_ZERO);
5158#ifndef HN_USE_TXDESC_BUFRING
5159	SLIST_INIT(&txr->hn_txlist);
5160#else
5161	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5162	    M_WAITOK, &txr->hn_tx_lock);
5163#endif
5164
5165	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5166		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5167		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5168	} else {
5169		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5170	}
5171
5172#ifdef HN_IFSTART_SUPPORT
5173	if (hn_use_if_start) {
5174		txr->hn_txeof = hn_start_txeof;
5175		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5176		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5177	} else
5178#endif
5179	{
5180		int br_depth;
5181
5182		txr->hn_txeof = hn_xmit_txeof;
5183		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5184		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5185
5186		br_depth = hn_get_txswq_depth(txr);
5187		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5188		    M_WAITOK, &txr->hn_tx_lock);
5189	}
5190
5191	txr->hn_direct_tx_size = hn_direct_tx_size;
5192
5193	/*
5194	 * Always schedule transmission instead of trying to do direct
5195	 * transmission.  This gives the best performance so far.
5196	 */
5197	txr->hn_sched_tx = 1;
5198
5199	parent_dtag = bus_get_dma_tag(dev);
5200
5201	/* DMA tag for RNDIS packet messages. */
5202	error = bus_dma_tag_create(parent_dtag, /* parent */
5203	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5204	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5205	    BUS_SPACE_MAXADDR,		/* lowaddr */
5206	    BUS_SPACE_MAXADDR,		/* highaddr */
5207	    NULL, NULL,			/* filter, filterarg */
5208	    HN_RNDIS_PKT_LEN,		/* maxsize */
5209	    1,				/* nsegments */
5210	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5211	    0,				/* flags */
5212	    NULL,			/* lockfunc */
5213	    NULL,			/* lockfuncarg */
5214	    &txr->hn_tx_rndis_dtag);
5215	if (error) {
5216		device_printf(dev, "failed to create rndis dmatag\n");
5217		return error;
5218	}
5219
5220	/* DMA tag for data. */
5221	error = bus_dma_tag_create(parent_dtag, /* parent */
5222	    1,				/* alignment */
5223	    HN_TX_DATA_BOUNDARY,	/* boundary */
5224	    BUS_SPACE_MAXADDR,		/* lowaddr */
5225	    BUS_SPACE_MAXADDR,		/* highaddr */
5226	    NULL, NULL,			/* filter, filterarg */
5227	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5228	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5229	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5230	    0,				/* flags */
5231	    NULL,			/* lockfunc */
5232	    NULL,			/* lockfuncarg */
5233	    &txr->hn_tx_data_dtag);
5234	if (error) {
5235		device_printf(dev, "failed to create data dmatag\n");
5236		return error;
5237	}
5238
5239	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5240		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5241
5242		txd->txr = txr;
5243		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5244		STAILQ_INIT(&txd->agg_list);
5245
5246		/*
5247		 * Allocate and load RNDIS packet message.
5248		 */
5249		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5250		    (void **)&txd->rndis_pkt,
5251		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5252		    &txd->rndis_pkt_dmap);
5253		if (error) {
5254			device_printf(dev,
5255			    "failed to allocate rndis_packet_msg, %d\n", i);
5256			return error;
5257		}
5258
5259		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5260		    txd->rndis_pkt_dmap,
5261		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5262		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5263		    BUS_DMA_NOWAIT);
5264		if (error) {
5265			device_printf(dev,
5266			    "failed to load rndis_packet_msg, %d\n", i);
5267			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5268			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5269			return error;
5270		}
5271
5272		/* DMA map for TX data. */
5273		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5274		    &txd->data_dmap);
5275		if (error) {
5276			device_printf(dev,
5277			    "failed to allocate tx data dmamap\n");
5278			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5279			    txd->rndis_pkt_dmap);
5280			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5281			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5282			return error;
5283		}
5284
5285		/* All set, put it on the list */
5286		txd->flags |= HN_TXD_FLAG_ONLIST;
5287#ifndef HN_USE_TXDESC_BUFRING
5288		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5289#else
5290		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5291#endif
5292	}
5293	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5294
5295	if (sc->hn_tx_sysctl_tree != NULL) {
5296		struct sysctl_oid_list *child;
5297		struct sysctl_ctx_list *ctx;
5298		char name[16];
5299
5300		/*
5301		 * Create per TX ring sysctl tree:
5302		 * dev.hn.UNIT.tx.RINGID
5303		 */
5304		ctx = device_get_sysctl_ctx(dev);
5305		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5306
5307		snprintf(name, sizeof(name), "%d", id);
5308		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5309		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5310
5311		if (txr->hn_tx_sysctl_tree != NULL) {
5312			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5313
5314#ifdef HN_DEBUG
5315			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5316			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5317			    "# of available TX descs");
5318#endif
5319#ifdef HN_IFSTART_SUPPORT
5320			if (!hn_use_if_start)
5321#endif
5322			{
5323				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5324				    CTLFLAG_RD, &txr->hn_oactive, 0,
5325				    "over active");
5326			}
5327			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5328			    CTLFLAG_RW, &txr->hn_pkts,
5329			    "# of packets transmitted");
5330			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5331			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5332		}
5333	}
5334
5335	return 0;
5336}
5337
5338static void
5339hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5340{
5341	struct hn_tx_ring *txr = txd->txr;
5342
5343	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5344	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5345
5346	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5347	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5348	    txd->rndis_pkt_dmap);
5349	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5350}
5351
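/*
 * Force-free a txdesc that is still pending, e.g. when the channel has
 * been revoked and no send-done will ever arrive for it.  Aggregated
 * txds are skipped; they are freed by their aggregating txd.
 */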
5352static void
5353hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5354{
5355
5356	KASSERT(txd->refs == 0 || txd->refs == 1,
5357	    ("invalid txd refs %d", txd->refs));
5358
5359	/* Aggregated txds will be freed by their aggregating txd. */
5360	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5361		int freed;
5362
5363		freed = hn_txdesc_put(txr, txd);
5364		KASSERT(freed, ("can't free txdesc"));
5365	}
5366}
5367
5368static void
5369hn_tx_ring_destroy(struct hn_tx_ring *txr)
5370{
5371	int i;
5372
5373	if (txr->hn_txdesc == NULL)
5374		return;
5375
5376	/*
5377	 * NOTE:
5378	 * Because the freeing of aggregated txds will be deferred
5379	 * to the aggregating txd, two passes are used here:
5380	 * - The first pass GCs any pending txds.  This GC is necessary,
5381	 *   since if the channels are revoked, the hypervisor will not
5382	 *   deliver send-done for all pending txds.
5383	 * - The second pass frees the busdma resources, i.e. it runs after
5384	 *   all txds have been freed.
5385	 */
5386	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5387		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5388	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5389		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5390
5391	if (txr->hn_tx_data_dtag != NULL)
5392		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5393	if (txr->hn_tx_rndis_dtag != NULL)
5394		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5395
5396#ifdef HN_USE_TXDESC_BUFRING
5397	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5398#endif
5399
5400	free(txr->hn_txdesc, M_DEVBUF);
5401	txr->hn_txdesc = NULL;
5402
5403	if (txr->hn_mbuf_br != NULL)
5404		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5405
5406#ifndef HN_USE_TXDESC_BUFRING
5407	mtx_destroy(&txr->hn_txlist_spin);
5408#endif
5409	mtx_destroy(&txr->hn_tx_lock);
5410}
5411
5412static int
5413hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5414{
5415	struct sysctl_oid_list *child;
5416	struct sysctl_ctx_list *ctx;
5417	int i;
5418
5419	/*
5420	 * Create TXBUF for chimney sending.
5421	 *
5422	 * NOTE: It is shared by all channels.
5423	 */
5424	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5425	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5426	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5427	if (sc->hn_chim == NULL) {
5428		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5429		return (ENOMEM);
5430	}
5431
5432	sc->hn_tx_ring_cnt = ring_cnt;
5433	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5434
5435	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5436	    M_DEVBUF, M_WAITOK | M_ZERO);
5437
5438	ctx = device_get_sysctl_ctx(sc->hn_dev);
5439	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5440
5441	/* Create dev.hn.UNIT.tx sysctl tree */
5442	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5443	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5444
5445	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5446		int error;
5447
5448		error = hn_tx_ring_create(sc, i);
5449		if (error)
5450			return error;
5451	}
5452
5453	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5454	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5455	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5456	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5457	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5458	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5459	    __offsetof(struct hn_tx_ring, hn_send_failed),
5460	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5461	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5462	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5463	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5464	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5465	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5466	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5467	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5468	    hn_tx_stat_ulong_sysctl, "LU",
5469	    "# of packet transmission aggregation flush failure");
5470	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5471	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5472	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5473	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5474	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5475	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5476	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5477	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5478	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5479	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5480	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5481	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5482	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5483	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5484	    "# of total TX descs");
5485	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5486	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5487	    "Chimney send packet size upper boundary");
5488	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5489	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5490	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5491	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5492	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5493	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5494	    hn_tx_conf_int_sysctl, "I",
5495	    "Size of the packet for direct transmission");
5496	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5497	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5498	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5499	    hn_tx_conf_int_sysctl, "I",
5500	    "Always schedule transmission "
5501	    "instead of doing direct transmission");
5502	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5503	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5504	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5505	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5506	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5507	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5508	    "Applied packet transmission aggregation size");
5509	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5510	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5511	    hn_txagg_pktmax_sysctl, "I",
5512	    "Applied packet transmission aggregation packets");
5513	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5514	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5515	    hn_txagg_align_sysctl, "I",
5516	    "Applied packet transmission aggregation alignment");
5517
5518	return 0;
5519}
5520
5521static void
5522hn_set_chim_size(struct hn_softc *sc, int chim_size)
5523{
5524	int i;
5525
5526	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5527		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5528}
5529
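/*
 * Recompute if_hw_tsomax from the NDIS-reported TSO limits: clamp the
 * requested value between hn_ndis_tso_sgmin * mtu and the lesser of
 * hn_ndis_tso_szmax and IP_MAXPACKET, subtract the Ethernet + VLAN
 * header, and never exceed the transparent VF's limit when one is ready.
 */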
5530static void
5531hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5532{
5533	struct ifnet *ifp = sc->hn_ifp;
5534	u_int hw_tsomax;
5535	int tso_minlen;
5536
5537	HN_LOCK_ASSERT(sc);
5538
5539	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5540		return;
5541
5542	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5543	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5544	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5545
5546	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5547	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5548	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5549
5550	if (tso_maxlen < tso_minlen)
5551		tso_maxlen = tso_minlen;
5552	else if (tso_maxlen > IP_MAXPACKET)
5553		tso_maxlen = IP_MAXPACKET;
5554	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5555		tso_maxlen = sc->hn_ndis_tso_szmax;
5556	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5557
5558	if (hn_xpnt_vf_isready(sc)) {
5559		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5560			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5561	}
5562	ifp->if_hw_tsomax = hw_tsomax;
5563	if (bootverbose)
5564		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5565}
5566
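/*
 * Propagate the negotiated host capabilities to the TX rings: set the
 * chimney size (optionally capped by hn_tx_chimney_size), the per-ring
 * checksum assist mask, and the HASHVAL pktinfo flag.
 */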
5567static void
5568hn_fixup_tx_data(struct hn_softc *sc)
5569{
5570	uint64_t csum_assist;
5571	int i;
5572
5573	hn_set_chim_size(sc, sc->hn_chim_szmax);
5574	if (hn_tx_chimney_size > 0 &&
5575	    hn_tx_chimney_size < sc->hn_chim_szmax)
5576		hn_set_chim_size(sc, hn_tx_chimney_size);
5577
5578	csum_assist = 0;
5579	if (sc->hn_caps & HN_CAP_IPCS)
5580		csum_assist |= CSUM_IP;
5581	if (sc->hn_caps & HN_CAP_TCP4CS)
5582		csum_assist |= CSUM_IP_TCP;
5583	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5584		csum_assist |= CSUM_IP_UDP;
5585	if (sc->hn_caps & HN_CAP_TCP6CS)
5586		csum_assist |= CSUM_IP6_TCP;
5587	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5588		csum_assist |= CSUM_IP6_UDP;
5589	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5590		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5591
5592	if (sc->hn_caps & HN_CAP_HASHVAL) {
5593		/*
5594		 * Support HASHVAL pktinfo on TX path.
5595		 */
5596		if (bootverbose)
5597			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5598		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5599			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5600	}
5601}
5602
5603static void
5604hn_fixup_rx_data(struct hn_softc *sc)
5605{
5606
5607	if (sc->hn_caps & HN_CAP_UDPHASH) {
5608		int i;
5609
5610		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5611			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5612	}
5613}
5614
5615static void
5616hn_destroy_tx_data(struct hn_softc *sc)
5617{
5618	int i;
5619
5620	if (sc->hn_chim != NULL) {
5621		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5622			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5623		} else {
5624			device_printf(sc->hn_dev,
5625			    "chimney sending buffer is referenced\n");
5626		}
5627		sc->hn_chim = NULL;
5628	}
5629
5630	if (sc->hn_tx_ring_cnt == 0)
5631		return;
5632
5633	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5634		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5635
5636	free(sc->hn_tx_ring, M_DEVBUF);
5637	sc->hn_tx_ring = NULL;
5638
5639	sc->hn_tx_ring_cnt = 0;
5640	sc->hn_tx_ring_inuse = 0;
5641}
5642
5643#ifdef HN_IFSTART_SUPPORT
5644
5645static void
5646hn_start_taskfunc(void *xtxr, int pending __unused)
5647{
5648	struct hn_tx_ring *txr = xtxr;
5649
5650	mtx_lock(&txr->hn_tx_lock);
5651	hn_start_locked(txr, 0);
5652	mtx_unlock(&txr->hn_tx_lock);
5653}
5654
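/*
 * Drain if_snd on the first TX ring.  Returns non-zero if the
 * remaining packets should be dispatched to the TX taskqueue.
 */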
5655static int
5656hn_start_locked(struct hn_tx_ring *txr, int len)
5657{
5658	struct hn_softc *sc = txr->hn_sc;
5659	struct ifnet *ifp = sc->hn_ifp;
5660	int sched = 0;
5661
5662	KASSERT(hn_use_if_start,
5663	    ("hn_start_locked is called while if_start is disabled"));
5664	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5665	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5666	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5667
5668	if (__predict_false(txr->hn_suspended))
5669		return (0);
5670
5671	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5672	    IFF_DRV_RUNNING)
5673		return (0);
5674
5675	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5676		struct hn_txdesc *txd;
5677		struct mbuf *m_head;
5678		int error;
5679
5680		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5681		if (m_head == NULL)
5682			break;
5683
5684		if (len > 0 && m_head->m_pkthdr.len > len) {
5685			/*
5686			 * This send could be time-consuming; let callers
5687			 * dispatch this packet (and any follow-up packets)
5688			 * to the TX taskqueue.
5689			 */
5690			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5691			sched = 1;
5692			break;
5693		}
5694
5695#if defined(INET6) || defined(INET)
5696		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5697			m_head = hn_tso_fixup(m_head);
5698			if (__predict_false(m_head == NULL)) {
5699				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5700				continue;
5701			}
5702		} else if (m_head->m_pkthdr.csum_flags &
5703		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5704			m_head = hn_set_hlen(m_head);
5705			if (__predict_false(m_head == NULL)) {
5706				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5707				continue;
5708			}
5709		}
5710#endif
5711
5712		txd = hn_txdesc_get(txr);
5713		if (txd == NULL) {
5714			txr->hn_no_txdescs++;
5715			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5716			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5717			break;
5718		}
5719
5720		error = hn_encap(ifp, txr, txd, &m_head);
5721		if (error) {
5722			/* Both txd and m_head are freed */
5723			KASSERT(txr->hn_agg_txd == NULL,
5724			    ("encap failed w/ pending aggregating txdesc"));
5725			continue;
5726		}
5727
5728		if (txr->hn_agg_pktleft == 0) {
5729			if (txr->hn_agg_txd != NULL) {
5730				KASSERT(m_head == NULL,
5731				    ("pending mbuf for aggregating txdesc"));
5732				error = hn_flush_txagg(ifp, txr);
5733				if (__predict_false(error)) {
5734					atomic_set_int(&ifp->if_drv_flags,
5735					    IFF_DRV_OACTIVE);
5736					break;
5737				}
5738			} else {
5739				KASSERT(m_head != NULL, ("mbuf was freed"));
5740				error = hn_txpkt(ifp, txr, txd);
5741				if (__predict_false(error)) {
5742					/* txd is freed, but m_head is not */
5743					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5744					atomic_set_int(&ifp->if_drv_flags,
5745					    IFF_DRV_OACTIVE);
5746					break;
5747				}
5748			}
5749		}
5750#ifdef INVARIANTS
5751		else {
5752			KASSERT(txr->hn_agg_txd != NULL,
5753			    ("no aggregating txdesc"));
5754			KASSERT(m_head == NULL,
5755			    ("pending mbuf for aggregating txdesc"));
5756		}
5757#endif
5758	}
5759
5760	/* Flush pending aggregated transmission. */
5761	if (txr->hn_agg_txd != NULL)
5762		hn_flush_txagg(ifp, txr);
5763	return (sched);
5764}
5765
5766static void
5767hn_start(struct ifnet *ifp)
5768{
5769	struct hn_softc *sc = ifp->if_softc;
5770	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5771
5772	if (txr->hn_sched_tx)
5773		goto do_sched;
5774
5775	if (mtx_trylock(&txr->hn_tx_lock)) {
5776		int sched;
5777
5778		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5779		mtx_unlock(&txr->hn_tx_lock);
5780		if (!sched)
5781			return;
5782	}
5783do_sched:
5784	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5785}
5786
5787static void
5788hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5789{
5790	struct hn_tx_ring *txr = xtxr;
5791
5792	mtx_lock(&txr->hn_tx_lock);
5793	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5794	hn_start_locked(txr, 0);
5795	mtx_unlock(&txr->hn_tx_lock);
5796}
5797
5798static void
5799hn_start_txeof(struct hn_tx_ring *txr)
5800{
5801	struct hn_softc *sc = txr->hn_sc;
5802	struct ifnet *ifp = sc->hn_ifp;
5803
5804	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5805
5806	if (txr->hn_sched_tx)
5807		goto do_sched;
5808
5809	if (mtx_trylock(&txr->hn_tx_lock)) {
5810		int sched;
5811
5812		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5813		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5814		mtx_unlock(&txr->hn_tx_lock);
5815		if (sched) {
5816			taskqueue_enqueue(txr->hn_tx_taskq,
5817			    &txr->hn_tx_task);
5818		}
5819	} else {
5820do_sched:
5821		/*
5822		 * Release OACTIVE early, in the hope that others
5823		 * can catch up.  The task will clear the flag
5824		 * again with hn_tx_lock held to avoid possible
5825		 * races.
5826		 */
5827		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5828		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5829	}
5830}
5831
5832#endif	/* HN_IFSTART_SUPPORT */
5833
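/*
 * Drain this TX ring's buf_ring.  Returns non-zero if the
 * remaining packets should be dispatched to the TX taskqueue.
 */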
5834static int
5835hn_xmit(struct hn_tx_ring *txr, int len)
5836{
5837	struct hn_softc *sc = txr->hn_sc;
5838	struct ifnet *ifp = sc->hn_ifp;
5839	struct mbuf *m_head;
5840	int sched = 0;
5841
5842	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5843#ifdef HN_IFSTART_SUPPORT
5844	KASSERT(hn_use_if_start == 0,
5845	    ("hn_xmit is called while if_start is enabled"));
5846#endif
5847	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5848
5849	if (__predict_false(txr->hn_suspended))
5850		return (0);
5851
5852	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5853		return (0);
5854
5855	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5856		struct hn_txdesc *txd;
5857		int error;
5858
5859		if (len > 0 && m_head->m_pkthdr.len > len) {
5860			/*
5861			 * This send could be time-consuming; let callers
5862			 * dispatch this packet (and any follow-up packets)
5863			 * to the TX taskqueue.
5864			 */
5865			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5866			sched = 1;
5867			break;
5868		}
5869
5870		txd = hn_txdesc_get(txr);
5871		if (txd == NULL) {
5872			txr->hn_no_txdescs++;
5873			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5874			txr->hn_oactive = 1;
5875			break;
5876		}
5877
5878		error = hn_encap(ifp, txr, txd, &m_head);
5879		if (error) {
5880			/* Both txd and m_head are freed; discard */
5881			KASSERT(txr->hn_agg_txd == NULL,
5882			    ("encap failed w/ pending aggregating txdesc"));
5883			drbr_advance(ifp, txr->hn_mbuf_br);
5884			continue;
5885		}
5886
5887		if (txr->hn_agg_pktleft == 0) {
5888			if (txr->hn_agg_txd != NULL) {
5889				KASSERT(m_head == NULL,
5890				    ("pending mbuf for aggregating txdesc"));
5891				error = hn_flush_txagg(ifp, txr);
5892				if (__predict_false(error)) {
5893					txr->hn_oactive = 1;
5894					break;
5895				}
5896			} else {
5897				KASSERT(m_head != NULL, ("mbuf was freed"));
5898				error = hn_txpkt(ifp, txr, txd);
5899				if (__predict_false(error)) {
5900					/* txd is freed, but m_head is not */
5901					drbr_putback(ifp, txr->hn_mbuf_br,
5902					    m_head);
5903					txr->hn_oactive = 1;
5904					break;
5905				}
5906			}
5907		}
5908#ifdef INVARIANTS
5909		else {
5910			KASSERT(txr->hn_agg_txd != NULL,
5911			    ("no aggregating txdesc"));
5912			KASSERT(m_head == NULL,
5913			    ("pending mbuf for aggregating txdesc"));
5914		}
5915#endif
5916
5917		/* Sent */
5918		drbr_advance(ifp, txr->hn_mbuf_br);
5919	}
5920
5921	/* Flush pending aggregated transmission. */
5922	if (txr->hn_agg_txd != NULL)
5923		hn_flush_txagg(ifp, txr);
5924	return (sched);
5925}
5926
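/*
 * if_transmit method.  If the transparent VF is active, hand the
 * packet over to the VF; otherwise enqueue it to the TX ring
 * selected by the flowid and kick off transmission.
 */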
5927static int
5928hn_transmit(struct ifnet *ifp, struct mbuf *m)
5929{
5930	struct hn_softc *sc = ifp->if_softc;
5931	struct hn_tx_ring *txr;
5932	int error, idx = 0;
5933
5934	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5935		struct rm_priotracker pt;
5936
5937		rm_rlock(&sc->hn_vf_lock, &pt);
5938		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5939			struct mbuf *m_bpf = NULL;
5940			int obytes, omcast = 0;
5941
5942			obytes = m->m_pkthdr.len;
5943			if (m->m_flags & M_MCAST)
5944				omcast = 1;
5945
5946			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5947				if (bpf_peers_present(ifp->if_bpf)) {
5948					m_bpf = m_copypacket(m, M_NOWAIT);
5949					if (m_bpf == NULL) {
5950						/*
5951						 * Failed to grab a shallow
5952						 * copy; tap now.
5953						 */
5954						ETHER_BPF_MTAP(ifp, m);
5955					}
5956				}
5957			} else {
5958				ETHER_BPF_MTAP(ifp, m);
5959			}
5960
5961			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5962			rm_runlock(&sc->hn_vf_lock, &pt);
5963
5964			if (m_bpf != NULL) {
5965				if (!error)
5966					ETHER_BPF_MTAP(ifp, m_bpf);
5967				m_freem(m_bpf);
5968			}
5969
5970			if (error == ENOBUFS) {
5971				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5972			} else if (error) {
5973				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5974			} else {
5975				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5976				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5977				if (omcast) {
5978					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5979					    omcast);
5980				}
5981			}
5982			return (error);
5983		}
5984		rm_runlock(&sc->hn_vf_lock, &pt);
5985	}
5986
5987#if defined(INET6) || defined(INET)
5988	/*
5989	 * Perform TSO packet header fixup or get l2/l3 header length now,
5990	 * since packet headers should be cache-hot.
5991	 */
5992	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5993		m = hn_tso_fixup(m);
5994		if (__predict_false(m == NULL)) {
5995			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5996			return EIO;
5997		}
5998	} else if (m->m_pkthdr.csum_flags &
5999	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6000		m = hn_set_hlen(m);
6001		if (__predict_false(m == NULL)) {
6002			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6003			return EIO;
6004		}
6005	}
6006#endif
6007
6008	/*
6009	 * Select the TX ring based on flowid
6010	 */
6011	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6012#if defined(INET6) || defined(INET)
6013		int tcpsyn = 0;
6014
6015		if (m->m_pkthdr.len < 128 &&
6016		    (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6017		    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6018			m = hn_check_tcpsyn(m, &tcpsyn);
6019			if (__predict_false(m == NULL)) {
6020				if_inc_counter(ifp,
6021				    IFCOUNTER_OERRORS, 1);
6022				return (EIO);
6023			}
6024		}
6025#else
6026		const int tcpsyn = 0;
6027#endif
6028		if (tcpsyn)
6029			idx = 0;
6030		else
6031			idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6032	}
6033	txr = &sc->hn_tx_ring[idx];
6034
6035	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6036	if (error) {
6037		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6038		return error;
6039	}
6040
6041	if (txr->hn_oactive)
6042		return 0;
6043
6044	if (txr->hn_sched_tx)
6045		goto do_sched;
6046
6047	if (mtx_trylock(&txr->hn_tx_lock)) {
6048		int sched;
6049
6050		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6051		mtx_unlock(&txr->hn_tx_lock);
6052		if (!sched)
6053			return 0;
6054	}
6055do_sched:
6056	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6057	return 0;
6058}
6059
6060static void
6061hn_tx_ring_qflush(struct hn_tx_ring *txr)
6062{
6063	struct mbuf *m;
6064
6065	mtx_lock(&txr->hn_tx_lock);
6066	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6067		m_freem(m);
6068	mtx_unlock(&txr->hn_tx_lock);
6069}
6070
6071static void
6072hn_xmit_qflush(struct ifnet *ifp)
6073{
6074	struct hn_softc *sc = ifp->if_softc;
6075	struct rm_priotracker pt;
6076	int i;
6077
6078	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6079		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6080	if_qflush(ifp);
6081
6082	rm_rlock(&sc->hn_vf_lock, &pt);
6083	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6084		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6085	rm_runlock(&sc->hn_vf_lock, &pt);
6086}
6087
6088static void
6089hn_xmit_txeof(struct hn_tx_ring *txr)
6090{
6091
6092	if (txr->hn_sched_tx)
6093		goto do_sched;
6094
6095	if (mtx_trylock(&txr->hn_tx_lock)) {
6096		int sched;
6097
6098		txr->hn_oactive = 0;
6099		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6100		mtx_unlock(&txr->hn_tx_lock);
6101		if (sched) {
6102			taskqueue_enqueue(txr->hn_tx_taskq,
6103			    &txr->hn_tx_task);
6104		}
6105	} else {
6106do_sched:
6107		/*
6108		 * Release oactive early, in the hope that others
6109		 * can catch up.  The task will clear oactive
6110		 * again with hn_tx_lock held to avoid possible
6111		 * races.
6112		 */
6113		txr->hn_oactive = 0;
6114		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6115	}
6116}
6117
6118static void
6119hn_xmit_taskfunc(void *xtxr, int pending __unused)
6120{
6121	struct hn_tx_ring *txr = xtxr;
6122
6123	mtx_lock(&txr->hn_tx_lock);
6124	hn_xmit(txr, 0);
6125	mtx_unlock(&txr->hn_tx_lock);
6126}
6127
6128static void
6129hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6130{
6131	struct hn_tx_ring *txr = xtxr;
6132
6133	mtx_lock(&txr->hn_tx_lock);
6134	txr->hn_oactive = 0;
6135	hn_xmit(txr, 0);
6136	mtx_unlock(&txr->hn_tx_lock);
6137}
6138
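/*
 * Link the channel to its RX/TX ring, bind it to a CPU and open it.
 */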
6139static int
6140hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6141{
6142	struct vmbus_chan_br cbr;
6143	struct hn_rx_ring *rxr;
6144	struct hn_tx_ring *txr = NULL;
6145	int idx, error;
6146
6147	idx = vmbus_chan_subidx(chan);
6148
6149	/*
6150	 * Link this channel to RX/TX ring.
6151	 */
6152	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6153	    ("invalid channel index %d, should be >= 0 && < %d",
6154	     idx, sc->hn_rx_ring_inuse));
6155	rxr = &sc->hn_rx_ring[idx];
6156	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6157	    ("RX ring %d already attached", idx));
6158	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6159	rxr->hn_chan = chan;
6160
6161	if (bootverbose) {
6162		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6163		    idx, vmbus_chan_id(chan));
6164	}
6165
6166	if (idx < sc->hn_tx_ring_inuse) {
6167		txr = &sc->hn_tx_ring[idx];
6168		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6169		    ("TX ring %d already attached", idx));
6170		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6171
6172		txr->hn_chan = chan;
6173		if (bootverbose) {
6174			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6175			    idx, vmbus_chan_id(chan));
6176		}
6177	}
6178
6179	/* Bind this channel to a proper CPU. */
6180	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6181
6182	/*
6183	 * Open this channel
6184	 */
6185	cbr.cbr = rxr->hn_br;
6186	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6187	cbr.cbr_txsz = HN_TXBR_SIZE;
6188	cbr.cbr_rxsz = HN_RXBR_SIZE;
6189	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6190	if (error) {
6191		if (error == EISCONN) {
6192			if_printf(sc->hn_ifp, "bufring is connected after "
6193			    "chan%u open failure\n", vmbus_chan_id(chan));
6194			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6195		} else {
6196			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6197			    vmbus_chan_id(chan), error);
6198		}
6199	}
6200	return (error);
6201}
6202
6203static void
6204hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6205{
6206	struct hn_rx_ring *rxr;
6207	int idx, error;
6208
6209	idx = vmbus_chan_subidx(chan);
6210
6211	/*
6212	 * Unlink this channel from its RX/TX ring.
6213	 */
6214	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6215	    ("invalid channel index %d, should be >= 0 && < %d",
6216	     idx, sc->hn_rx_ring_inuse));
6217	rxr = &sc->hn_rx_ring[idx];
6218	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6219	    ("RX ring %d is not attached", idx));
6220	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6221
6222	if (idx < sc->hn_tx_ring_inuse) {
6223		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6224
6225		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6226		    ("TX ring %d is not attached", idx));
6227		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6228	}
6229
6230	/*
6231	 * Close this channel.
6232	 *
6233	 * NOTE:
6234	 * Channel closing does _not_ destroy the target channel.
6235	 */
6236	error = vmbus_chan_close_direct(chan);
6237	if (error == EISCONN) {
6238		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6239		    "after being closed\n", vmbus_chan_id(chan));
6240		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6241	} else if (error) {
6242		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6243		    vmbus_chan_id(chan), error);
6244	}
6245}
6246
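/*
 * Attach all allocated sub-channels.  Errors are recorded but do not
 * stop the loop; all channels will be detached on failure.
 */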
6247static int
6248hn_attach_subchans(struct hn_softc *sc)
6249{
6250	struct vmbus_channel **subchans;
6251	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6252	int i, error = 0;
6253
6254	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6255
6256	/* Attach the sub-channels. */
6257	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6258	for (i = 0; i < subchan_cnt; ++i) {
6259		int error1;
6260
6261		error1 = hn_chan_attach(sc, subchans[i]);
6262		if (error1) {
6263			error = error1;
6264			/* Move on; all channels will be detached later. */
6265		}
6266	}
6267	vmbus_subchan_rel(subchans, subchan_cnt);
6268
6269	if (error) {
6270		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6271	} else {
6272		if (bootverbose) {
6273			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6274			    subchan_cnt);
6275		}
6276	}
6277	return (error);
6278}
6279
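/*
 * Detach the sub-channels first, then the primary channel, and wait
 * for the sub-channels to be destroyed.
 */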
6280static void
6281hn_detach_allchans(struct hn_softc *sc)
6282{
6283	struct vmbus_channel **subchans;
6284	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6285	int i;
6286
6287	if (subchan_cnt == 0)
6288		goto back;
6289
6290	/* Detach the sub-channels. */
6291	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6292	for (i = 0; i < subchan_cnt; ++i)
6293		hn_chan_detach(sc, subchans[i]);
6294	vmbus_subchan_rel(subchans, subchan_cnt);
6295
6296back:
6297	/*
6298	 * Detach the primary channel, _after_ all sub-channels
6299	 * are detached.
6300	 */
6301	hn_chan_detach(sc, sc->hn_prichan);
6302
6303	/* Wait for sub-channels to be destroyed, if any. */
6304	vmbus_subchan_drain(sc->hn_prichan);
6305
6306#ifdef INVARIANTS
6307	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6308		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6309		    HN_RX_FLAG_ATTACHED) == 0,
6310		    ("%dth RX ring is still attached", i));
6311	}
6312	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6313		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6314		    HN_TX_FLAG_ATTACHED) == 0,
6315		    ("%dth TX ring is still attached", i));
6316	}
6317#endif
6318}
6319
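/*
 * Allocate sub-channels from NVS, limited by the # of RX rings the
 * RSS capabilities allow.  On return *nsubch holds the # of
 * sub-channels actually allocated; 0 means only the primary channel
 * will be used.
 */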
6320static int
6321hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6322{
6323	struct vmbus_channel **subchans;
6324	int nchan, rxr_cnt, error;
6325
6326	nchan = *nsubch + 1;
6327	if (nchan == 1) {
6328		/*
6329		 * Multiple RX/TX rings are not requested.
6330		 */
6331		*nsubch = 0;
6332		return (0);
6333	}
6334
6335	/*
6336	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6337	 * table entries.
6338	 */
6339	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6340	if (error) {
6341		/* No RSS; this is benign. */
6342		*nsubch = 0;
6343		return (0);
6344	}
6345	if (bootverbose) {
6346		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6347		    rxr_cnt, nchan);
6348	}
6349
6350	if (nchan > rxr_cnt)
6351		nchan = rxr_cnt;
6352	if (nchan == 1) {
6353		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6354		*nsubch = 0;
6355		return (0);
6356	}
6357
6358	/*
6359	 * Allocate sub-channels from NVS.
6360	 */
6361	*nsubch = nchan - 1;
6362	error = hn_nvs_alloc_subchans(sc, nsubch);
6363	if (error || *nsubch == 0) {
6364		/* Failed to allocate sub-channels. */
6365		*nsubch = 0;
6366		return (0);
6367	}
6368
6369	/*
6370	 * Wait for all sub-channels to become ready before moving on.
6371	 */
6372	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6373	vmbus_subchan_rel(subchans, *nsubch);
6374	return (0);
6375}
6376
6377static bool
6378hn_synth_attachable(const struct hn_softc *sc)
6379{
6380	int i;
6381
6382	if (sc->hn_flags & HN_FLAG_ERRORS)
6383		return (false);
6384
6385	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6386		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6387
6388		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6389			return (false);
6390	}
6391	return (true);
6392}
6393
6394/*
6395 * Make sure that the RX filter is zero after the successful
6396 * RNDIS initialization.
6397 *
6398 * NOTE:
6399 * Under certain conditions on certain versions of Hyper-V,
6400 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6401 * after the successful RNDIS initialization, which breaks
6402 * the assumption of any following code (well, it breaks the
6403 * RNDIS API contract actually).  Clear the RNDIS rxfilter
6404 * explicitly, drain packets sneaking through, and drain the
6405 * interrupt taskqueues scheduled due to the stealth packets.
6406 */
6407static void
6408hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6409{
6410
6411	hn_disable_rx(sc);
6412	hn_drain_rxtx(sc, nchan);
6413}
6414
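/*
 * Attach the synthetic parts: open the channels, attach NVS and
 * RNDIS, allocate sub-channels and configure RSS.
 */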
6415static int
6416hn_synth_attach(struct hn_softc *sc, int mtu)
6417{
6418#define ATTACHED_NVS		0x0002
6419#define ATTACHED_RNDIS		0x0004
6420
6421	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6422	int error, nsubch, nchan = 1, i, rndis_inited;
6423	uint32_t old_caps, attached = 0;
6424
6425	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6426	    ("synthetic parts were attached"));
6427
6428	if (!hn_synth_attachable(sc))
6429		return (ENXIO);
6430
6431	/* Save capabilities for later verification. */
6432	old_caps = sc->hn_caps;
6433	sc->hn_caps = 0;
6434
6435	/* Clear RSS state. */
6436	sc->hn_rss_ind_size = 0;
6437	sc->hn_rss_hash = 0;
6438	sc->hn_rss_hcap = 0;
6439
6440	/*
6441	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6442	 */
6443	error = hn_chan_attach(sc, sc->hn_prichan);
6444	if (error)
6445		goto failed;
6446
6447	/*
6448	 * Attach NVS.
6449	 */
6450	error = hn_nvs_attach(sc, mtu);
6451	if (error)
6452		goto failed;
6453	attached |= ATTACHED_NVS;
6454
6455	/*
6456	 * Attach RNDIS _after_ NVS is attached.
6457	 */
6458	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6459	if (rndis_inited)
6460		attached |= ATTACHED_RNDIS;
6461	if (error)
6462		goto failed;
6463
6464	/*
6465	 * Make sure capabilities are not changed.
6466	 */
6467	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6468		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6469		    old_caps, sc->hn_caps);
6470		error = ENXIO;
6471		goto failed;
6472	}
6473
6474	/*
6475	 * Allocate sub-channels for multi-TX/RX rings.
6476	 *
6477	 * NOTE:
6478	 * The # of RX rings that can be used is equivalent to the # of
6479	 * channels to be requested.
6480	 */
6481	nsubch = sc->hn_rx_ring_cnt - 1;
6482	error = hn_synth_alloc_subchans(sc, &nsubch);
6483	if (error)
6484		goto failed;
6485	/* NOTE: _Full_ synthetic parts detach is required now. */
6486	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6487
6488	/*
6489	 * Set the # of TX/RX rings that could be used according to
6490	 * the # of channels that NVS offered.
6491	 */
6492	nchan = nsubch + 1;
6493	hn_set_ring_inuse(sc, nchan);
6494	if (nchan == 1) {
6495		/* Only the primary channel can be used; done */
6496		goto back;
6497	}
6498
6499	/*
6500	 * Attach the sub-channels.
6501	 *
6502	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6503	 */
6504	error = hn_attach_subchans(sc);
6505	if (error)
6506		goto failed;
6507
6508	/*
6509	 * Configure RSS key and indirect table _after_ all sub-channels
6510	 * are attached.
6511	 */
6512	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6513		/*
6514		 * RSS key is not set yet; set it to the default RSS key.
6515		 */
6516		if (bootverbose)
6517			if_printf(sc->hn_ifp, "setup default RSS key\n");
6518		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6519		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6520	}
6521
6522	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6523		/*
6524		 * RSS indirect table is not set yet; set it up in round-
6525		 * robin fashion.
6526		 */
6527		if (bootverbose) {
6528			if_printf(sc->hn_ifp, "setup default RSS indirect "
6529			    "table\n");
6530		}
6531		for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6532			rss->rss_ind[i] = i % nchan;
6533		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6534	} else {
6535		/*
6536		 * # of usable channels may be changed, so we have to
6537		 * make sure that all entries in RSS indirect table
6538		 * are valid.
6539		 *
6540		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6541		 */
6542		hn_rss_ind_fixup(sc);
6543	}
6544
6545	sc->hn_rss_hash = sc->hn_rss_hcap;
6546	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6547	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6548		/* NOTE: Don't reconfigure RSS now; it is applied right below. */
6549		hn_vf_rss_fixup(sc, false);
6550	}
6551	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6552	if (error)
6553		goto failed;
6554back:
6555	/*
6556	 * Fixup transmission aggregation setup.
6557	 */
6558	hn_set_txagg(sc);
6559	hn_rndis_init_fixat(sc, nchan);
6560	return (0);
6561
6562failed:
6563	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6564		hn_rndis_init_fixat(sc, nchan);
6565		hn_synth_detach(sc);
6566	} else {
6567		if (attached & ATTACHED_RNDIS) {
6568			hn_rndis_init_fixat(sc, nchan);
6569			hn_rndis_detach(sc);
6570		}
6571		if (attached & ATTACHED_NVS)
6572			hn_nvs_detach(sc);
6573		hn_chan_detach(sc, sc->hn_prichan);
6574		/* Restore old capabilities. */
6575		sc->hn_caps = old_caps;
6576	}
6577	return (error);
6578
6579#undef ATTACHED_RNDIS
6580#undef ATTACHED_NVS
6581}
6582
6583/*
6584 * NOTE:
6585 * The interface must have been suspended through hn_suspend() before
6586 * this function gets called.
6587 */
6588static void
6589hn_synth_detach(struct hn_softc *sc)
6590{
6591
6592	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6593	    ("synthetic parts were not attached"));
6594
6595	/* Detach the RNDIS first. */
6596	hn_rndis_detach(sc);
6597
6598	/* Detach NVS. */
6599	hn_nvs_detach(sc);
6600
6601	/* Detach all of the channels. */
6602	hn_detach_allchans(sc);
6603
6604	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6605}
6606
6607static void
6608hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6609{
6610	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6611	    ("invalid ring count %d", ring_cnt));
6612
6613	if (sc->hn_tx_ring_cnt > ring_cnt)
6614		sc->hn_tx_ring_inuse = ring_cnt;
6615	else
6616		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6617	sc->hn_rx_ring_inuse = ring_cnt;
6618
6619	if (bootverbose) {
6620		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6621		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6622	}
6623}
6624
6625static void
6626hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6627{
6628
6629	/*
6630	 * NOTE:
6631	 * The TX bufring will not be drained by the hypervisor,
6632	 * if the primary channel is revoked.
6633	 */
6634	while (!vmbus_chan_rx_empty(chan) ||
6635	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6636	     !vmbus_chan_tx_empty(chan)))
6637		pause("waitch", 1);
6638	vmbus_chan_intr_drain(chan);
6639}
6640
6641static void
6642hn_disable_rx(struct hn_softc *sc)
6643{
6644
6645	/*
6646	 * Disable RX by clearing RX filter forcefully.
6647	 */
6648	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6649	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6650
6651	/*
6652	 * Give RNDIS enough time to flush all pending data packets.
6653	 */
6654	pause("waitrx", (200 * hz) / 1000);
6655}
6656
6657/*
6658 * NOTE:
6659 * RX/TX _must_ have been suspended/disabled, before this function
6660 * is called.
6661 */
6662static void
6663hn_drain_rxtx(struct hn_softc *sc, int nchan)
6664{
6665	struct vmbus_channel **subch = NULL;
6666	int nsubch;
6667
6668	/*
6669	 * Drain RX/TX bufrings and interrupts.
6670	 */
6671	nsubch = nchan - 1;
6672	if (nsubch > 0)
6673		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6674
6675	if (subch != NULL) {
6676		int i;
6677
6678		for (i = 0; i < nsubch; ++i)
6679			hn_chan_drain(sc, subch[i]);
6680	}
6681	hn_chan_drain(sc, sc->hn_prichan);
6682
6683	if (subch != NULL)
6684		vmbus_subchan_rel(subch, nsubch);
6685}
6686
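/*
 * Suspend the data path: stop TX, wait for pending sends, disable
 * RX, then drain the bufrings and any pending TX tasks.
 */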
6687static void
6688hn_suspend_data(struct hn_softc *sc)
6689{
6690	struct hn_tx_ring *txr;
6691	int i;
6692
6693	HN_LOCK_ASSERT(sc);
6694
6695	/*
6696	 * Suspend TX.
6697	 */
6698	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6699		txr = &sc->hn_tx_ring[i];
6700
6701		mtx_lock(&txr->hn_tx_lock);
6702		txr->hn_suspended = 1;
6703		mtx_unlock(&txr->hn_tx_lock);
6704		/* No one is able to send more packets now. */
6705
6706		/*
6707		 * Wait for all pending sends to finish.
6708		 *
6709		 * NOTE:
6710		 * We will _not_ receive all pending send-dones if the
6711		 * primary channel is revoked.
6712		 */
6713		while (hn_tx_ring_pending(txr) &&
6714		    !vmbus_chan_is_revoked(sc->hn_prichan))
6715			pause("hnwtx", 1 /* 1 tick */);
6716	}
6717
6718	/*
6719	 * Disable RX.
6720	 */
6721	hn_disable_rx(sc);
6722
6723	/*
6724	 * Drain RX/TX.
6725	 */
6726	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6727
6728	/*
6729	 * Drain any pending TX tasks.
6730	 *
6731	 * NOTE:
6732	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6733	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6734	 */
6735	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6736		txr = &sc->hn_tx_ring[i];
6737
6738		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6739		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6740	}
6741}
6742
6743static void
6744hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6745{
6746
6747	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6748}
6749
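/*
 * Cut off access to hn_mgmt_taskq0 through hn_mgmt_taskq and wait
 * for all pending management tasks to complete.
 */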
6750static void
6751hn_suspend_mgmt(struct hn_softc *sc)
6752{
6753	struct task task;
6754
6755	HN_LOCK_ASSERT(sc);
6756
6757	/*
6758	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6759	 * through hn_mgmt_taskq.
6760	 */
6761	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6762	vmbus_chan_run_task(sc->hn_prichan, &task);
6763
6764	/*
6765	 * Make sure that all pending management tasks are completed.
6766	 */
6767	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6768	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6769	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6770}
6771
6772static void
6773hn_suspend(struct hn_softc *sc)
6774{
6775
6776	/* Disable polling. */
6777	hn_polling(sc, 0);
6778
6779	/*
6780	 * If the non-transparent mode VF is activated, the synthetic
6781	 * device is receiving packets, so the data path of the
6782	 * synthetic device must be suspended.
6783	 */
6784	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6785	    (sc->hn_flags & HN_FLAG_RXVF))
6786		hn_suspend_data(sc);
6787	hn_suspend_mgmt(sc);
6788}
6789
6790static void
6791hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6792{
6793	int i;
6794
6795	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6796	    ("invalid TX ring count %d", tx_ring_cnt));
6797
6798	for (i = 0; i < tx_ring_cnt; ++i) {
6799		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6800
6801		mtx_lock(&txr->hn_tx_lock);
6802		txr->hn_suspended = 0;
6803		mtx_unlock(&txr->hn_tx_lock);
6804	}
6805}
6806
6807static void
6808hn_resume_data(struct hn_softc *sc)
6809{
6810	int i;
6811
6812	HN_LOCK_ASSERT(sc);
6813
6814	/*
6815	 * Re-enable RX.
6816	 */
6817	hn_rxfilter_config(sc);
6818
6819	/*
6820	 * Make sure to clear suspend status on "all" TX rings,
6821	 * since hn_tx_ring_inuse can be changed after
6822	 * hn_suspend_data().
6823	 */
6824	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6825
6826#ifdef HN_IFSTART_SUPPORT
6827	if (!hn_use_if_start)
6828#endif
6829	{
6830		/*
6831		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6832		 * reduced.
6833		 */
6834		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6835			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6836	}
6837
6838	/*
6839	 * Kick start TX.
6840	 */
6841	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6842		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6843
6844		/*
6845		 * Use txeof task, so that any pending oactive can be
6846		 * cleared properly.
6847		 */
6848		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6849	}
6850}
6851
6852static void
6853hn_resume_mgmt(struct hn_softc *sc)
6854{
6855
6856	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6857
6858	/*
6859	 * Kick off network change detection, if it was pending.
6860	 * If no network change was pending, start link status
6861	 * checks, which is more lightweight than network change
6862	 * detection.
6863	 */
6864	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6865		hn_change_network(sc);
6866	else
6867		hn_update_link_status(sc);
6868}
6869
6870static void
6871hn_resume(struct hn_softc *sc)
6872{
6873
6874	/*
6875	 * If the non-transparent mode VF is activated, the synthetic
6876 * device has to receive packets, so the data path of the
6877	 * synthetic device must be resumed.
6878	 */
6879	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6880	    (sc->hn_flags & HN_FLAG_RXVF))
6881		hn_resume_data(sc);
6882
6883	/*
6884	 * Don't resume link status change if VF is attached/activated.
6885	 * - In the non-transparent VF mode, the synthetic device marks
6886	 *   link down until the VF is deactivated; i.e. VF is down.
6887	 * - In transparent VF mode, VF's media status is used until
6888	 *   the VF is detached.
6889	 */
6890	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6891	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6892		hn_resume_mgmt(sc);
6893
6894	/*
6895	 * Re-enable polling if this interface is running and
6896	 * the polling is requested.
6897	 */
6898	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6899		hn_polling(sc, sc->hn_pollhz);
6900}
6901
6902static void
6903hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6904{
6905	const struct rndis_status_msg *msg;
6906	int ofs;
6907
6908	if (dlen < sizeof(*msg)) {
6909		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6910		return;
6911	}
6912	msg = data;
6913
6914	switch (msg->rm_status) {
6915	case RNDIS_STATUS_MEDIA_CONNECT:
6916	case RNDIS_STATUS_MEDIA_DISCONNECT:
6917		hn_update_link_status(sc);
6918		break;
6919
6920	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6921	case RNDIS_STATUS_LINK_SPEED_CHANGE:
6922		/* Not really useful; ignore. */
6923		break;
6924
6925	case RNDIS_STATUS_NETWORK_CHANGE:
6926		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6927		if (dlen < ofs + msg->rm_stbuflen ||
6928		    msg->rm_stbuflen < sizeof(uint32_t)) {
6929			if_printf(sc->hn_ifp, "network changed\n");
6930		} else {
6931			uint32_t change;
6932
6933			memcpy(&change, ((const uint8_t *)msg) + ofs,
6934			    sizeof(change));
6935			if_printf(sc->hn_ifp, "network changed, change %u\n",
6936			    change);
6937		}
6938		hn_change_network(sc);
6939		break;
6940
6941	default:
6942		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6943		    msg->rm_status);
6944		break;
6945	}
6946}
6947
6948static int
6949hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6950{
6951	const struct rndis_pktinfo *pi = info_data;
6952	uint32_t mask = 0;
6953
6954	while (info_dlen != 0) {
6955		const void *data;
6956		uint32_t dlen;
6957
6958		if (__predict_false(info_dlen < sizeof(*pi)))
6959			return (EINVAL);
6960		if (__predict_false(info_dlen < pi->rm_size))
6961			return (EINVAL);
6962		info_dlen -= pi->rm_size;
6963
6964		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6965			return (EINVAL);
6966		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6967			return (EINVAL);
6968		dlen = pi->rm_size - pi->rm_pktinfooffset;
6969		data = pi->rm_data;
6970
6971		switch (pi->rm_type) {
6972		case NDIS_PKTINFO_TYPE_VLAN:
6973			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6974				return (EINVAL);
6975			info->vlan_info = *((const uint32_t *)data);
6976			mask |= HN_RXINFO_VLAN;
6977			break;
6978
6979		case NDIS_PKTINFO_TYPE_CSUM:
6980			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6981				return (EINVAL);
6982			info->csum_info = *((const uint32_t *)data);
6983			mask |= HN_RXINFO_CSUM;
6984			break;
6985
6986		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6987			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6988				return (EINVAL);
6989			info->hash_value = *((const uint32_t *)data);
6990			mask |= HN_RXINFO_HASHVAL;
6991			break;
6992
6993		case HN_NDIS_PKTINFO_TYPE_HASHINF:
6994			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6995				return (EINVAL);
6996			info->hash_info = *((const uint32_t *)data);
6997			mask |= HN_RXINFO_HASHINF;
6998			break;
6999
7000		default:
7001			goto next;
7002		}
7003
7004		if (mask == HN_RXINFO_ALL) {
7005			/* All found; done */
7006			break;
7007		}
7008next:
7009		pi = (const struct rndis_pktinfo *)
7010		    ((const uint8_t *)pi + pi->rm_size);
7011	}
7012
7013	/*
7014	 * Final fixup.
7015	 * - If there is no hash value, invalidate the hash info.
7016	 */
7017	if ((mask & HN_RXINFO_HASHVAL) == 0)
7018		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7019	return (0);
7020}
7021
7022static __inline bool
7023hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7024{
7025
7026	if (off < check_off) {
7027		if (__predict_true(off + len <= check_off))
7028			return (false);
7029	} else if (off > check_off) {
7030		if (__predict_true(check_off + check_len <= off))
7031			return (false);
7032	}
7033	return (true);
7034}
7035
7036static void
7037hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7038{
7039	const struct rndis_packet_msg *pkt;
7040	struct hn_rxinfo info;
7041	int data_off, pktinfo_off, data_len, pktinfo_len;
7042
7043	/*
7044	 * Check length.
7045	 */
7046	if (__predict_false(dlen < sizeof(*pkt))) {
7047		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7048		return;
7049	}
7050	pkt = data;
7051
7052	if (__predict_false(dlen < pkt->rm_len)) {
7053		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7054		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7055		return;
7056	}
7057	if (__predict_false(pkt->rm_len <
7058	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7059		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7060		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7061		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7062		    pkt->rm_pktinfolen);
7063		return;
7064	}
7065	if (__predict_false(pkt->rm_datalen == 0)) {
7066		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7067		return;
7068	}
7069
7070	/*
7071	 * Check offsets.
7072	 */
7073#define IS_OFFSET_INVALID(ofs)			\
7074	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7075	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7076
7077	/* XXX Hyper-V does not meet data offset alignment requirement */
7078	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7079		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7080		    "data offset %u\n", pkt->rm_dataoffset);
7081		return;
7082	}
7083	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7084	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7085		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7086		    "oob offset %u\n", pkt->rm_oobdataoffset);
7087		return;
7088	}
7089	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7090	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7091		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7092		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7093		return;
7094	}
7095
7096#undef IS_OFFSET_INVALID
7097
7098	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7099	data_len = pkt->rm_datalen;
7100	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7101	pktinfo_len = pkt->rm_pktinfolen;
7102
7103	/*
7104	 * Check OOB coverage.
7105	 */
7106	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7107		int oob_off, oob_len;
7108
7109		if_printf(rxr->hn_ifp, "got oobdata\n");
7110		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7111		oob_len = pkt->rm_oobdatalen;
7112
7113		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7114			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7115			    "oob overflow, msglen %u, oob abs %d len %d\n",
7116			    pkt->rm_len, oob_off, oob_len);
7117			return;
7118		}
7119
7120		/*
7121		 * Check against data.
7122		 */
7123		if (hn_rndis_check_overlap(oob_off, oob_len,
7124		    data_off, data_len)) {
7125			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7126			    "oob overlaps data, oob abs %d len %d, "
7127			    "data abs %d len %d\n",
7128			    oob_off, oob_len, data_off, data_len);
7129			return;
7130		}
7131
7132		/*
7133		 * Check against pktinfo.
7134		 */
7135		if (pktinfo_len != 0 &&
7136		    hn_rndis_check_overlap(oob_off, oob_len,
7137		    pktinfo_off, pktinfo_len)) {
7138			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7139			    "oob overlaps pktinfo, oob abs %d len %d, "
7140			    "pktinfo abs %d len %d\n",
7141			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7142			return;
7143		}
7144	}
7145
7146	/*
7147	 * Check per-packet-info coverage and find useful per-packet-info.
7148	 */
7149	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7150	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7151	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7152	if (__predict_true(pktinfo_len != 0)) {
7153		bool overlap;
7154		int error;
7155
7156		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7157			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7158			    "pktinfo overflow, msglen %u, "
7159			    "pktinfo abs %d len %d\n",
7160			    pkt->rm_len, pktinfo_off, pktinfo_len);
7161			return;
7162		}
7163
7164		/*
7165		 * Check packet info coverage.
7166		 */
7167		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7168		    data_off, data_len);
7169		if (__predict_false(overlap)) {
7170			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7171			    "pktinfo overlap data, pktinfo abs %d len %d, "
7172			    "data abs %d len %d\n",
7173			    pktinfo_off, pktinfo_len, data_off, data_len);
7174			return;
7175		}
7176
7177		/*
7178		 * Find useful per-packet-info.
7179		 */
7180		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7181		    pktinfo_len, &info);
7182		if (__predict_false(error)) {
7183			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7184			    "pktinfo\n");
7185			return;
7186		}
7187	}
7188
7189	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7190		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7191		    "data overflow, msglen %u, data abs %d len %d\n",
7192		    pkt->rm_len, data_off, data_len);
7193		return;
7194	}
7195	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7196}
7197
7198static __inline void
7199hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7200{
7201	const struct rndis_msghdr *hdr;
7202
7203	if (__predict_false(dlen < sizeof(*hdr))) {
7204		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7205		return;
7206	}
7207	hdr = data;
7208
7209	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7210		/* Hot data path. */
7211		hn_rndis_rx_data(rxr, data, dlen);
7212		/* Done! */
7213		return;
7214	}
7215
7216	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7217		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7218	else
7219		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7220}
7221
7222static void
7223hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7224{
7225	const struct hn_nvs_hdr *hdr;
7226
7227	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7228		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7229		return;
7230	}
7231	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7232
7233	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7234		/* Useless; ignore */
7235		return;
7236	}
7237	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7238}
7239
7240static void
7241hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7242    const struct vmbus_chanpkt_hdr *pkt)
7243{
7244	struct hn_nvs_sendctx *sndc;
7245
7246	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7247	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7248	    VMBUS_CHANPKT_DATALEN(pkt));
7249	/*
7250	 * NOTE:
7251	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7252	 * its callback.
7253	 */
7254}
7255
7256static void
7257hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7258    const struct vmbus_chanpkt_hdr *pkthdr)
7259{
7260	const struct vmbus_chanpkt_rxbuf *pkt;
7261	const struct hn_nvs_hdr *nvs_hdr;
7262	int count, i, hlen;
7263
7264	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7265		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7266		return;
7267	}
7268	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7269
7270	/* Make sure that this is a RNDIS message. */
7271	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7272		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7273		    nvs_hdr->nvs_type);
7274		return;
7275	}
7276
7277	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7278	if (__predict_false(hlen < sizeof(*pkt))) {
7279		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7280		return;
7281	}
7282	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7283
7284	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7285		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7286		    pkt->cp_rxbuf_id);
7287		return;
7288	}
7289
7290	count = pkt->cp_rxbuf_cnt;
7291	if (__predict_false(hlen <
7292	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7293		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7294		return;
7295	}
7296
7297	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7298	for (i = 0; i < count; ++i) {
7299		int ofs, len;
7300
7301		ofs = pkt->cp_rxbuf[i].rb_ofs;
7302		len = pkt->cp_rxbuf[i].rb_len;
7303		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7304			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7305			    "ofs %d, len %d\n", i, ofs, len);
7306			continue;
7307		}
7308		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7309	}
7310
7311	/*
7312	 * Ack the consumed RXBUF associated w/ this channel packet,
7313	 * so that this RXBUF can be recycled by the hypervisor.
7314	 */
7315	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7316}
7317
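/*
 * Ack the consumed RXBUF back to the hypervisor so that it can be
 * recycled, retrying a few times if the TX bufring is full.
 */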
7318static void
7319hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7320    uint64_t tid)
7321{
7322	struct hn_nvs_rndis_ack ack;
7323	int retries, error;
7324
7325	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7326	ack.nvs_status = HN_NVS_STATUS_OK;
7327
7328	retries = 0;
7329again:
7330	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7331	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7332	if (__predict_false(error == EAGAIN)) {
7333		/*
7334		 * NOTE:
7335		 * This should _not_ happen in the real world, since the
7336		 * consumption of the TX bufring from the TX path is
7337		 * controlled.
7338		 */
7339		if (rxr->hn_ack_failed == 0)
7340			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7341		rxr->hn_ack_failed++;
7342		retries++;
7343		if (retries < 10) {
7344			DELAY(100);
7345			goto again;
7346		}
7347		/* RXBUF leaks! */
7348		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7349	}
7350}
7351
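/*
 * Per-channel callback: pull channel packets out of the bufring and
 * dispatch them until the bufring is empty.
 */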
7352static void
7353hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7354{
7355	struct hn_rx_ring *rxr = xrxr;
7356	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7357
7358	for (;;) {
7359		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7360		int error, pktlen;
7361
7362		pktlen = rxr->hn_pktbuf_len;
7363		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7364		if (__predict_false(error == ENOBUFS)) {
7365			void *nbuf;
7366			int nlen;
7367
7368			/*
7369			 * Expand channel packet buffer.
7370			 *
7371			 * XXX
7372			 * Use M_WAITOK here, since allocation failure
7373			 * is fatal.
7374			 */
7375			nlen = rxr->hn_pktbuf_len * 2;
7376			while (nlen < pktlen)
7377				nlen *= 2;
7378			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7379
7380			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7381			    rxr->hn_pktbuf_len, nlen);
7382
7383			free(rxr->hn_pktbuf, M_DEVBUF);
7384			rxr->hn_pktbuf = nbuf;
7385			rxr->hn_pktbuf_len = nlen;
7386			/* Retry! */
7387			continue;
7388		} else if (__predict_false(error == EAGAIN)) {
7389			/* No more channel packets; done! */
7390			break;
7391		}
7392		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7393
7394		switch (pkt->cph_type) {
7395		case VMBUS_CHANPKT_TYPE_COMP:
7396			hn_nvs_handle_comp(sc, chan, pkt);
7397			break;
7398
7399		case VMBUS_CHANPKT_TYPE_RXBUF:
7400			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7401			break;
7402
7403		case VMBUS_CHANPKT_TYPE_INBAND:
7404			hn_nvs_handle_notify(sc, pkt);
7405			break;
7406
7407		default:
7408			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7409			    pkt->cph_type);
7410			break;
7411		}
7412	}
7413	hn_chan_rollup(rxr, rxr->hn_txr);
7414}
7415
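/*
 * Driver-wide initialization: sanity check tunables, set up the VF
 * map and, in global TX taskqueue mode, create the TX taskqueues.
 */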
7416static void
7417hn_sysinit(void *arg __unused)
7418{
7419	int i;
7420
7421	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7422
7423#ifdef HN_IFSTART_SUPPORT
7424	/*
7425	 * Don't use ifnet.if_start if transparent VF mode is requested;
7426	 * mainly due to the IFF_DRV_OACTIVE flag.
7427	 */
7428	if (hn_xpnt_vf && hn_use_if_start) {
7429		hn_use_if_start = 0;
7430		printf("hn: transparent VF mode, if_transmit will be used, "
7431		    "instead of if_start\n");
7432	}
7433#endif
7434	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7435		printf("hn: invalid transparent VF attach wait "
7436		    "timeout %d, reset to %d\n",
7437		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7438		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7439	}
7440
7441	/*
7442	 * Initialize VF map.
7443	 */
7444	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7445	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7446	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7447	    M_WAITOK | M_ZERO);
7448
7449	/*
7450	 * Fix the # of TX taskqueues.
7451	 */
7452	if (hn_tx_taskq_cnt <= 0)
7453		hn_tx_taskq_cnt = 1;
7454	else if (hn_tx_taskq_cnt > mp_ncpus)
7455		hn_tx_taskq_cnt = mp_ncpus;
7456
7457	/*
7458	 * Fix the TX taskqueue mode.
7459	 */
7460	switch (hn_tx_taskq_mode) {
7461	case HN_TX_TASKQ_M_INDEP:
7462	case HN_TX_TASKQ_M_GLOBAL:
7463	case HN_TX_TASKQ_M_EVTTQ:
7464		break;
7465	default:
7466		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7467		break;
7468	}
7469
7470	if (vm_guest != VM_GUEST_HV)
7471		return;
7472
7473	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7474		return;
7475
7476	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7477	    M_DEVBUF, M_WAITOK);
7478	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7479		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7480		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7481		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7482		    "hn tx%d", i);
7483	}
7484}
7485SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7486
7487static void
7488hn_sysuninit(void *arg __unused)
7489{
7490
7491	if (hn_tx_taskque != NULL) {
7492		int i;
7493
7494		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7495			taskqueue_free(hn_tx_taskque[i]);
7496		free(hn_tx_taskque, M_DEVBUF);
7497	}
7498
7499	if (hn_vfmap != NULL)
7500		free(hn_vfmap, M_DEVBUF);
7501	rm_destroy(&hn_vfmap_lock);
7502
7503	counter_u64_free(hn_udpcs_fixup);
7504}
7505SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7506