1/*-
2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*-
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54
55#include <sys/cdefs.h>
56__FBSDID("$FreeBSD$");
57
58#include "opt_hn.h"
59#include "opt_inet6.h"
60#include "opt_inet.h"
61#include "opt_rss.h"
62
63#include <sys/param.h>
64#include <sys/systm.h>
65#include <sys/bus.h>
66#include <sys/counter.h>
67#include <sys/kernel.h>
68#include <sys/limits.h>
69#include <sys/malloc.h>
70#include <sys/mbuf.h>
71#include <sys/module.h>
72#include <sys/queue.h>
73#include <sys/lock.h>
74#include <sys/proc.h>
75#include <sys/rmlock.h>
76#include <sys/sbuf.h>
77#include <sys/sched.h>
78#include <sys/smp.h>
79#include <sys/socket.h>
80#include <sys/sockio.h>
81#include <sys/sx.h>
82#include <sys/sysctl.h>
83#include <sys/taskqueue.h>
84#include <sys/buf_ring.h>
85#include <sys/eventhandler.h>
86
87#include <machine/atomic.h>
88#include <machine/in_cksum.h>
89
90#include <net/bpf.h>
91#include <net/ethernet.h>
92#include <net/if.h>
93#include <net/if_dl.h>
94#include <net/if_media.h>
95#include <net/if_types.h>
96#include <net/if_var.h>
97#include <net/rndis.h>
98#ifdef RSS
99#include <net/rss_config.h>
100#endif
101
102#include <netinet/in_systm.h>
103#include <netinet/in.h>
104#include <netinet/ip.h>
105#include <netinet/ip6.h>
106#include <netinet/tcp.h>
107#include <netinet/tcp_lro.h>
108#include <netinet/udp.h>
109
110#include <dev/hyperv/include/hyperv.h>
111#include <dev/hyperv/include/hyperv_busdma.h>
112#include <dev/hyperv/include/vmbus.h>
113#include <dev/hyperv/include/vmbus_xact.h>
114
115#include <dev/hyperv/netvsc/ndis.h>
116#include <dev/hyperv/netvsc/if_hnreg.h>
117#include <dev/hyperv/netvsc/if_hnvar.h>
118#include <dev/hyperv/netvsc/hn_nvs.h>
119#include <dev/hyperv/netvsc/hn_rndis.h>
120
121#include "vmbus_if.h"
122
123#define HN_IFSTART_SUPPORT
124
125#define HN_RING_CNT_DEF_MAX		8
126
127#define HN_VFMAP_SIZE_DEF		8
128
129#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
130
131/* YYY should get it from the underlying channel */
132#define HN_TX_DESC_CNT			512
133
134#define HN_RNDIS_PKT_LEN					\
135	(sizeof(struct rndis_packet_msg) +			\
136	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
137	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
138	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
139	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
140#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
141#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
142
143#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
144#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
145#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
146/* -1 for RNDIS packet message */
147#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
148
149#define HN_DIRECT_TX_SIZE_DEF		128
150
151#define HN_EARLY_TXEOF_THRESH		8
152
153#define HN_PKTBUF_LEN_DEF		(16 * 1024)
154
155#define HN_LROENT_CNT_DEF		128
156
157#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
158#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
159/* YYY 2*MTU is a bit rough, but should be good enough. */
160#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)
161
162#define HN_LRO_ACKCNT_DEF		1
163
164#define HN_LOCK_INIT(sc)		\
165	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
166#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
167#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
168#define HN_LOCK(sc)					\
169do {							\
170	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
171		/* Relinquish cpu to avoid deadlock */	\
172		sched_relinquish(curthread);		\
173		DELAY(1000);				\
174	}						\
175} while (0)
176#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
177
178#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
179#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
180#define HN_CSUM_IP_HWASSIST(sc)		\
181	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
182#define HN_CSUM_IP6_HWASSIST(sc)	\
183	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
184
185#define HN_PKTSIZE_MIN(align)		\
186	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
187	    HN_RNDIS_PKT_LEN, (align))
188#define HN_PKTSIZE(m, align)		\
189	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
190
191#ifdef RSS
192#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
193#else
194#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
195#endif
196
197struct hn_txdesc {
198#ifndef HN_USE_TXDESC_BUFRING
199	SLIST_ENTRY(hn_txdesc)		link;
200#endif
201	STAILQ_ENTRY(hn_txdesc)		agg_link;
202
203	/* Aggregated txdescs, in sending order. */
204	STAILQ_HEAD(, hn_txdesc)	agg_list;
205
206	/* The oldest packet, if transmission aggregation happens. */
207	struct mbuf			*m;
208	struct hn_tx_ring		*txr;
209	int				refs;
210	uint32_t			flags;	/* HN_TXD_FLAG_ */
211	struct hn_nvs_sendctx		send_ctx;
212	uint32_t			chim_index;
213	int				chim_size;
214
215	bus_dmamap_t			data_dmap;
216
217	bus_addr_t			rndis_pkt_paddr;
218	struct rndis_packet_msg		*rndis_pkt;
219	bus_dmamap_t			rndis_pkt_dmap;
220};
221
222#define HN_TXD_FLAG_ONLIST		0x0001
223#define HN_TXD_FLAG_DMAMAP		0x0002
224#define HN_TXD_FLAG_ONAGG		0x0004
225
226#define	HN_NDIS_PKTINFO_SUBALLOC	0x01
227#define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
228#define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
229
230struct packet_info_id {
231	uint8_t				ver;
232	uint8_t				flag;
233	uint16_t			pkt_id;
234};
235
236#define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
237
238
239struct hn_rxinfo {
240	const uint32_t			*vlan_info;
241	const uint32_t			*csum_info;
242	const uint32_t			*hash_info;
243	const uint32_t			*hash_value;
244	const struct packet_info_id	*pktinfo_id;
245};
246
247struct hn_rxvf_setarg {
248	struct hn_rx_ring	*rxr;
249	struct ifnet		*vf_ifp;
250};
251
252#define HN_RXINFO_VLAN			0x0001
253#define HN_RXINFO_CSUM			0x0002
254#define HN_RXINFO_HASHINF		0x0004
255#define HN_RXINFO_HASHVAL		0x0008
256#define HN_RXINFO_PKTINFO_ID		0x0010
257#define HN_RXINFO_ALL			\
258	(HN_RXINFO_VLAN |		\
259	 HN_RXINFO_CSUM |		\
260	 HN_RXINFO_HASHINF |		\
261	 HN_RXINFO_HASHVAL |		\
262	 HN_RXINFO_PKTINFO_ID)
263
264static int			hn_probe(device_t);
265static int			hn_attach(device_t);
266static int			hn_detach(device_t);
267static int			hn_shutdown(device_t);
268static void			hn_chan_callback(struct vmbus_channel *,
269				    void *);
270
271static void			hn_init(void *);
272static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
273#ifdef HN_IFSTART_SUPPORT
274static void			hn_start(struct ifnet *);
275#endif
276static int			hn_transmit(struct ifnet *, struct mbuf *);
277static void			hn_xmit_qflush(struct ifnet *);
278static int			hn_ifmedia_upd(struct ifnet *);
279static void			hn_ifmedia_sts(struct ifnet *,
280				    struct ifmediareq *);
281
282static void			hn_ifnet_event(void *, struct ifnet *, int);
283static void			hn_ifaddr_event(void *, struct ifnet *);
284static void			hn_ifnet_attevent(void *, struct ifnet *);
285static void			hn_ifnet_detevent(void *, struct ifnet *);
286static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);
287
288static bool			hn_ismyvf(const struct hn_softc *,
289				    const struct ifnet *);
290static void			hn_rxvf_change(struct hn_softc *,
291				    struct ifnet *, bool);
292static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
293static void			hn_rxvf_set_task(void *, int);
294static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
295static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
296static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
297				    struct ifreq *);
298static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
299static bool			hn_xpnt_vf_isready(struct hn_softc *);
300static void			hn_xpnt_vf_setready(struct hn_softc *);
301static void			hn_xpnt_vf_init_taskfunc(void *, int);
302static void			hn_xpnt_vf_init(struct hn_softc *);
303static void			hn_xpnt_vf_setenable(struct hn_softc *);
304static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
305static void			hn_vf_rss_fixup(struct hn_softc *, bool);
306static void			hn_vf_rss_restore(struct hn_softc *);
307
308static int			hn_rndis_rxinfo(const void *, int,
309				    struct hn_rxinfo *);
310static void			hn_rndis_rx_data(struct hn_rx_ring *,
311				    const void *, int);
312static void			hn_rndis_rx_status(struct hn_softc *,
313				    const void *, int);
314static void			hn_rndis_init_fixat(struct hn_softc *, int);
315
316static void			hn_nvs_handle_notify(struct hn_softc *,
317				    const struct vmbus_chanpkt_hdr *);
318static void			hn_nvs_handle_comp(struct hn_softc *,
319				    struct vmbus_channel *,
320				    const struct vmbus_chanpkt_hdr *);
321static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
322				    struct vmbus_channel *,
323				    const struct vmbus_chanpkt_hdr *);
324static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
325				    struct vmbus_channel *, uint64_t);
326
327#if __FreeBSD_version >= 1100099
328static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
329static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
330#endif
331static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
332static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
333#if __FreeBSD_version < 1100095
334static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
335#else
336static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
337#endif
338static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
339static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
340static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
341static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
342static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
343static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
344static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
345#ifndef RSS
346static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
347static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
348#endif
349static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
350static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
351static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
352static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
353static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
354static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
355static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
356static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
357static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
358static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
359static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
360static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
361static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
362static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
363
364static void			hn_stop(struct hn_softc *, bool);
365static void			hn_init_locked(struct hn_softc *);
366static int			hn_chan_attach(struct hn_softc *,
367				    struct vmbus_channel *);
368static void			hn_chan_detach(struct hn_softc *,
369				    struct vmbus_channel *);
370static int			hn_attach_subchans(struct hn_softc *);
371static void			hn_detach_allchans(struct hn_softc *);
372static void			hn_chan_rollup(struct hn_rx_ring *,
373				    struct hn_tx_ring *);
374static void			hn_set_ring_inuse(struct hn_softc *, int);
375static int			hn_synth_attach(struct hn_softc *, int);
376static void			hn_synth_detach(struct hn_softc *);
377static int			hn_synth_alloc_subchans(struct hn_softc *,
378				    int *);
379static bool			hn_synth_attachable(const struct hn_softc *);
380static void			hn_suspend(struct hn_softc *);
381static void			hn_suspend_data(struct hn_softc *);
382static void			hn_suspend_mgmt(struct hn_softc *);
383static void			hn_resume(struct hn_softc *);
384static void			hn_resume_data(struct hn_softc *);
385static void			hn_resume_mgmt(struct hn_softc *);
386static void			hn_suspend_mgmt_taskfunc(void *, int);
387static void			hn_chan_drain(struct hn_softc *,
388				    struct vmbus_channel *);
389static void			hn_disable_rx(struct hn_softc *);
390static void			hn_drain_rxtx(struct hn_softc *, int);
391static void			hn_polling(struct hn_softc *, u_int);
392static void			hn_chan_polling(struct vmbus_channel *, u_int);
393static void			hn_mtu_change_fixup(struct hn_softc *);
394
395static void			hn_update_link_status(struct hn_softc *);
396static void			hn_change_network(struct hn_softc *);
397static void			hn_link_taskfunc(void *, int);
398static void			hn_netchg_init_taskfunc(void *, int);
399static void			hn_netchg_status_taskfunc(void *, int);
400static void			hn_link_status(struct hn_softc *);
401
402static int			hn_create_rx_data(struct hn_softc *, int);
403static void			hn_destroy_rx_data(struct hn_softc *);
404static int			hn_check_iplen(const struct mbuf *, int);
405static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
406static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
407static int			hn_rxfilter_config(struct hn_softc *);
408static int			hn_rss_reconfig(struct hn_softc *);
409static void			hn_rss_ind_fixup(struct hn_softc *);
410static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
411static int			hn_rxpkt(struct hn_rx_ring *);
412static uint32_t			hn_rss_type_fromndis(uint32_t);
413static uint32_t			hn_rss_type_tondis(uint32_t);
414
415static int			hn_tx_ring_create(struct hn_softc *, int);
416static void			hn_tx_ring_destroy(struct hn_tx_ring *);
417static int			hn_create_tx_data(struct hn_softc *, int);
418static void			hn_fixup_tx_data(struct hn_softc *);
419static void			hn_fixup_rx_data(struct hn_softc *);
420static void			hn_destroy_tx_data(struct hn_softc *);
421static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
422static void			hn_txdesc_gc(struct hn_tx_ring *,
423				    struct hn_txdesc *);
424static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
425				    struct hn_txdesc *, struct mbuf **);
426static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
427				    struct hn_txdesc *);
428static void			hn_set_chim_size(struct hn_softc *, int);
429static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
430static bool			hn_tx_ring_pending(struct hn_tx_ring *);
431static void			hn_tx_ring_qflush(struct hn_tx_ring *);
432static void			hn_resume_tx(struct hn_softc *, int);
433static void			hn_set_txagg(struct hn_softc *);
434static void			*hn_try_txagg(struct ifnet *,
435				    struct hn_tx_ring *, struct hn_txdesc *,
436				    int);
437static int			hn_get_txswq_depth(const struct hn_tx_ring *);
438static void			hn_txpkt_done(struct hn_nvs_sendctx *,
439				    struct hn_softc *, struct vmbus_channel *,
440				    const void *, int);
441static int			hn_txpkt_sglist(struct hn_tx_ring *,
442				    struct hn_txdesc *);
443static int			hn_txpkt_chim(struct hn_tx_ring *,
444				    struct hn_txdesc *);
445static int			hn_xmit(struct hn_tx_ring *, int);
446static void			hn_xmit_taskfunc(void *, int);
447static void			hn_xmit_txeof(struct hn_tx_ring *);
448static void			hn_xmit_txeof_taskfunc(void *, int);
449#ifdef HN_IFSTART_SUPPORT
450static int			hn_start_locked(struct hn_tx_ring *, int);
451static void			hn_start_taskfunc(void *, int);
452static void			hn_start_txeof(struct hn_tx_ring *);
453static void			hn_start_txeof_taskfunc(void *, int);
454#endif
455
456SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
457    "Hyper-V network interface");
458
459/* Trust tcp segements verification on host side. */
460static int			hn_trust_hosttcp = 1;
461SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
462    &hn_trust_hosttcp, 0,
463    "Trust tcp segement verification on host side, "
464    "when csum info is missing (global setting)");
465
466/* Trust udp datagrams verification on host side. */
467static int			hn_trust_hostudp = 1;
468SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
469    &hn_trust_hostudp, 0,
470    "Trust udp datagram verification on host side, "
471    "when csum info is missing (global setting)");
472
473/* Trust ip packets verification on host side. */
474static int			hn_trust_hostip = 1;
475SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
476    &hn_trust_hostip, 0,
477    "Trust ip packet verification on host side, "
478    "when csum info is missing (global setting)");
479
480/*
481 * Offload UDP/IPv4 checksum.
482 */
483static int			hn_enable_udp4cs = 1;
484SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
485    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
486
487/*
488 * Offload UDP/IPv6 checksum.
489 */
490static int			hn_enable_udp6cs = 1;
491SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
492    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
493
494/* Stats. */
495static counter_u64_t		hn_udpcs_fixup;
496SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
497    &hn_udpcs_fixup, "# of UDP checksum fixup");
498
499/*
500 * See hn_set_hlen().
501 *
502 * This value is for Azure.  For Hyper-V, set this above
503 * 65536 to disable UDP datagram checksum fixup.
504 */
505static int			hn_udpcs_fixup_mtu = 1420;
506SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
507    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
508
509/* Limit TSO burst size */
510static int			hn_tso_maxlen = IP_MAXPACKET;
511SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
512    &hn_tso_maxlen, 0, "TSO burst limit");
513
514/* Limit chimney send size */
515static int			hn_tx_chimney_size = 0;
516SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
517    &hn_tx_chimney_size, 0, "Chimney send packet size limit");
518
519/* Limit the size of packet for direct transmission */
520static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
521SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
522    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
523
524/* # of LRO entries per RX ring */
525#if defined(INET) || defined(INET6)
526#if __FreeBSD_version >= 1100095
527static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
528SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
529    &hn_lro_entry_count, 0, "LRO entry count");
530#endif
531#endif
532
533static int			hn_tx_taskq_cnt = 1;
534SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
535    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
536
537#define HN_TX_TASKQ_M_INDEP	0
538#define HN_TX_TASKQ_M_GLOBAL	1
539#define HN_TX_TASKQ_M_EVTTQ	2
540
541static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
542SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
543    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
544    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
545
546#ifndef HN_USE_TXDESC_BUFRING
547static int			hn_use_txdesc_bufring = 0;
548#else
549static int			hn_use_txdesc_bufring = 1;
550#endif
551SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
552    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
553
554#ifdef HN_IFSTART_SUPPORT
555/* Use ifnet.if_start instead of ifnet.if_transmit */
556static int			hn_use_if_start = 0;
557SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
558    &hn_use_if_start, 0, "Use if_start TX method");
559#endif
560
561/* # of channels to use */
562static int			hn_chan_cnt = 0;
563SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
564    &hn_chan_cnt, 0,
565    "# of channels to use; each channel has one RX ring and one TX ring");
566
567/* # of transmit rings to use */
568static int			hn_tx_ring_cnt = 0;
569SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
570    &hn_tx_ring_cnt, 0, "# of TX rings to use");
571
572/* Software TX ring deptch */
573static int			hn_tx_swq_depth = 0;
574SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
575    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
576
577/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
578#if __FreeBSD_version >= 1100095
579static u_int			hn_lro_mbufq_depth = 0;
580SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
581    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
582#endif
583
584/* Packet transmission aggregation size limit */
585static int			hn_tx_agg_size = -1;
586SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
587    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
588
589/* Packet transmission aggregation count limit */
590static int			hn_tx_agg_pkts = -1;
591SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
592    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
593
594/* VF list */
595SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
596    0, 0, hn_vflist_sysctl, "A", "VF list");
597
598/* VF mapping */
599SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
600    0, 0, hn_vfmap_sysctl, "A", "VF mapping");
601
602/* Transparent VF */
603static int			hn_xpnt_vf = 1;
604SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
605    &hn_xpnt_vf, 0, "Transparent VF mod");
606
607/* Accurate BPF support for Transparent VF */
608static int			hn_xpnt_vf_accbpf = 0;
609SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
610    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
611
612/* Extra wait for transparent VF attach routing; unit seconds. */
613static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
614SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
615    &hn_xpnt_vf_attwait, 0,
616    "Extra wait for transparent VF attach routing; unit: seconds");
617
618static u_int			hn_cpu_index;	/* next CPU for channel */
619static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
620
621static struct rmlock		hn_vfmap_lock;
622static int			hn_vfmap_size;
623static struct ifnet		**hn_vfmap;
624
625#ifndef RSS
626static const uint8_t
627hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
628	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
629	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
630	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
631	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
632	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
633};
634#endif	/* !RSS */
635
636static const struct hyperv_guid	hn_guid = {
637	.hv_guid = {
638	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
639	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
640};
641
642static device_method_t hn_methods[] = {
643	/* Device interface */
644	DEVMETHOD(device_probe,		hn_probe),
645	DEVMETHOD(device_attach,	hn_attach),
646	DEVMETHOD(device_detach,	hn_detach),
647	DEVMETHOD(device_shutdown,	hn_shutdown),
648	DEVMETHOD_END
649};
650
651static driver_t hn_driver = {
652	"hn",
653	hn_methods,
654	sizeof(struct hn_softc)
655};
656
657static devclass_t hn_devclass;
658
659DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
660MODULE_VERSION(hn, 1);
661MODULE_DEPEND(hn, vmbus, 1, 1, 1);
662
663#if __FreeBSD_version >= 1100099
664static void
665hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
666{
667	int i;
668
669	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
670		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
671}
672#endif
673
674static int
675hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
676{
677
678	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
679	    txd->chim_size == 0, ("invalid rndis sglist txd"));
680	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
681	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
682}
683
684static int
685hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
686{
687	struct hn_nvs_rndis rndis;
688
689	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
690	    txd->chim_size > 0, ("invalid rndis chim txd"));
691
692	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
693	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
694	rndis.nvs_chim_idx = txd->chim_index;
695	rndis.nvs_chim_sz = txd->chim_size;
696
697	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
698	    &rndis, sizeof(rndis), &txd->send_ctx));
699}
700
701static __inline uint32_t
702hn_chim_alloc(struct hn_softc *sc)
703{
704	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
705	u_long *bmap = sc->hn_chim_bmap;
706	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
707
708	for (i = 0; i < bmap_cnt; ++i) {
709		int idx;
710
711		idx = ffsl(~bmap[i]);
712		if (idx == 0)
713			continue;
714
715		--idx; /* ffsl is 1-based */
716		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
717		    ("invalid i %d and idx %d", i, idx));
718
719		if (atomic_testandset_long(&bmap[i], idx))
720			continue;
721
722		ret = i * LONG_BIT + idx;
723		break;
724	}
725	return (ret);
726}
727
728static __inline void
729hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
730{
731	u_long mask;
732	uint32_t idx;
733
734	idx = chim_idx / LONG_BIT;
735	KASSERT(idx < sc->hn_chim_bmap_cnt,
736	    ("invalid chimney index 0x%x", chim_idx));
737
738	mask = 1UL << (chim_idx % LONG_BIT);
739	KASSERT(sc->hn_chim_bmap[idx] & mask,
740	    ("index bitmap 0x%lx, chimney index %u, "
741	     "bitmap idx %d, bitmask 0x%lx",
742	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
743
744	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
745}
746
747#if defined(INET6) || defined(INET)
748
749#define PULLUP_HDR(m, len)				\
750do {							\
751	if (__predict_false((m)->m_len < (len))) {	\
752		(m) = m_pullup((m), (len));		\
753		if ((m) == NULL)			\
754			return (NULL);			\
755	}						\
756} while (0)
757
758/*
759 * NOTE: If this function failed, the m_head would be freed.
760 */
761static __inline struct mbuf *
762hn_tso_fixup(struct mbuf *m_head)
763{
764	struct ether_vlan_header *evl;
765	struct tcphdr *th;
766	int ehlen;
767
768	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
769
770	PULLUP_HDR(m_head, sizeof(*evl));
771	evl = mtod(m_head, struct ether_vlan_header *);
772	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
773		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
774	else
775		ehlen = ETHER_HDR_LEN;
776	m_head->m_pkthdr.l2hlen = ehlen;
777
778#ifdef INET
779	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
780		struct ip *ip;
781		int iphlen;
782
783		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
784		ip = mtodo(m_head, ehlen);
785		iphlen = ip->ip_hl << 2;
786		m_head->m_pkthdr.l3hlen = iphlen;
787
788		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
789		th = mtodo(m_head, ehlen + iphlen);
790
791		ip->ip_len = 0;
792		ip->ip_sum = 0;
793		th->th_sum = in_pseudo(ip->ip_src.s_addr,
794		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
795	}
796#endif
797#if defined(INET6) && defined(INET)
798	else
799#endif
800#ifdef INET6
801	{
802		struct ip6_hdr *ip6;
803
804		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
805		ip6 = mtodo(m_head, ehlen);
806		if (ip6->ip6_nxt != IPPROTO_TCP) {
807			m_freem(m_head);
808			return (NULL);
809		}
810		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
811
812		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
813		th = mtodo(m_head, ehlen + sizeof(*ip6));
814
815		ip6->ip6_plen = 0;
816		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
817	}
818#endif
819	return (m_head);
820}
821
822/*
823 * NOTE: If this function failed, the m_head would be freed.
824 */
825static __inline struct mbuf *
826hn_set_hlen(struct mbuf *m_head)
827{
828	const struct ether_vlan_header *evl;
829	int ehlen;
830
831	PULLUP_HDR(m_head, sizeof(*evl));
832	evl = mtod(m_head, const struct ether_vlan_header *);
833	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
834		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
835	else
836		ehlen = ETHER_HDR_LEN;
837	m_head->m_pkthdr.l2hlen = ehlen;
838
839#ifdef INET
840	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
841		const struct ip *ip;
842		int iphlen;
843
844		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
845		ip = mtodo(m_head, ehlen);
846		iphlen = ip->ip_hl << 2;
847		m_head->m_pkthdr.l3hlen = iphlen;
848
849		/*
850		 * UDP checksum offload does not work in Azure, if the
851		 * following conditions meet:
852		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
853		 * - IP_DF is not set in the IP hdr.
854		 *
855		 * Fallback to software checksum for these UDP datagrams.
856		 */
857		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
858		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
859		    (ntohs(ip->ip_off) & IP_DF) == 0) {
860			uint16_t off = ehlen + iphlen;
861
862			counter_u64_add(hn_udpcs_fixup, 1);
863			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
864			*(uint16_t *)(m_head->m_data + off +
865                            m_head->m_pkthdr.csum_data) = in_cksum_skip(
866			    m_head, m_head->m_pkthdr.len, off);
867			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
868		}
869	}
870#endif
871#if defined(INET6) && defined(INET)
872	else
873#endif
874#ifdef INET6
875	{
876		const struct ip6_hdr *ip6;
877
878		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
879		ip6 = mtodo(m_head, ehlen);
880		if (ip6->ip6_nxt != IPPROTO_TCP &&
881		    ip6->ip6_nxt != IPPROTO_UDP) {
882			m_freem(m_head);
883			return (NULL);
884		}
885		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
886	}
887#endif
888	return (m_head);
889}
890
891/*
892 * NOTE: If this function failed, the m_head would be freed.
893 */
894static __inline struct mbuf *
895hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
896{
897	const struct tcphdr *th;
898	int ehlen, iphlen;
899
900	*tcpsyn = 0;
901	ehlen = m_head->m_pkthdr.l2hlen;
902	iphlen = m_head->m_pkthdr.l3hlen;
903
904	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
905	th = mtodo(m_head, ehlen + iphlen);
906	if (th->th_flags & TH_SYN)
907		*tcpsyn = 1;
908	return (m_head);
909}
910
911#undef PULLUP_HDR
912
913#endif	/* INET6 || INET */
914
915static int
916hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
917{
918	int error = 0;
919
920	HN_LOCK_ASSERT(sc);
921
922	if (sc->hn_rx_filter != filter) {
923		error = hn_rndis_set_rxfilter(sc, filter);
924		if (!error)
925			sc->hn_rx_filter = filter;
926	}
927	return (error);
928}
929
930static int
931hn_rxfilter_config(struct hn_softc *sc)
932{
933	struct ifnet *ifp = sc->hn_ifp;
934	uint32_t filter;
935
936	HN_LOCK_ASSERT(sc);
937
938	/*
939	 * If the non-transparent mode VF is activated, we don't know how
940	 * its RX filter is configured, so stick the synthetic device in
941	 * the promiscous mode.
942	 */
943	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
944		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
945	} else {
946		filter = NDIS_PACKET_TYPE_DIRECTED;
947		if (ifp->if_flags & IFF_BROADCAST)
948			filter |= NDIS_PACKET_TYPE_BROADCAST;
949		/* TODO: support multicast list */
950		if ((ifp->if_flags & IFF_ALLMULTI) ||
951		    !CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
952			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
953	}
954	return (hn_set_rxfilter(sc, filter));
955}
956
957static void
958hn_set_txagg(struct hn_softc *sc)
959{
960	uint32_t size, pkts;
961	int i;
962
963	/*
964	 * Setup aggregation size.
965	 */
966	if (sc->hn_agg_size < 0)
967		size = UINT32_MAX;
968	else
969		size = sc->hn_agg_size;
970
971	if (sc->hn_rndis_agg_size < size)
972		size = sc->hn_rndis_agg_size;
973
974	/* NOTE: We only aggregate packets using chimney sending buffers. */
975	if (size > (uint32_t)sc->hn_chim_szmax)
976		size = sc->hn_chim_szmax;
977
978	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
979		/* Disable */
980		size = 0;
981		pkts = 0;
982		goto done;
983	}
984
985	/* NOTE: Type of the per TX ring setting is 'int'. */
986	if (size > INT_MAX)
987		size = INT_MAX;
988
989	/*
990	 * Setup aggregation packet count.
991	 */
992	if (sc->hn_agg_pkts < 0)
993		pkts = UINT32_MAX;
994	else
995		pkts = sc->hn_agg_pkts;
996
997	if (sc->hn_rndis_agg_pkts < pkts)
998		pkts = sc->hn_rndis_agg_pkts;
999
1000	if (pkts <= 1) {
1001		/* Disable */
1002		size = 0;
1003		pkts = 0;
1004		goto done;
1005	}
1006
1007	/* NOTE: Type of the per TX ring setting is 'short'. */
1008	if (pkts > SHRT_MAX)
1009		pkts = SHRT_MAX;
1010
1011done:
1012	/* NOTE: Type of the per TX ring setting is 'short'. */
1013	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1014		/* Disable */
1015		size = 0;
1016		pkts = 0;
1017	}
1018
1019	if (bootverbose) {
1020		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1021		    size, pkts, sc->hn_rndis_agg_align);
1022	}
1023
1024	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1025		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1026
1027		mtx_lock(&txr->hn_tx_lock);
1028		txr->hn_agg_szmax = size;
1029		txr->hn_agg_pktmax = pkts;
1030		txr->hn_agg_align = sc->hn_rndis_agg_align;
1031		mtx_unlock(&txr->hn_tx_lock);
1032	}
1033}
1034
1035static int
1036hn_get_txswq_depth(const struct hn_tx_ring *txr)
1037{
1038
1039	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1040	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1041		return txr->hn_txdesc_cnt;
1042	return hn_tx_swq_depth;
1043}
1044
1045static int
1046hn_rss_reconfig(struct hn_softc *sc)
1047{
1048	int error;
1049
1050	HN_LOCK_ASSERT(sc);
1051
1052	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1053		return (ENXIO);
1054
1055	/*
1056	 * Disable RSS first.
1057	 *
1058	 * NOTE:
1059	 * Direct reconfiguration by setting the UNCHG flags does
1060	 * _not_ work properly.
1061	 */
1062	if (bootverbose)
1063		if_printf(sc->hn_ifp, "disable RSS\n");
1064	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1065	if (error) {
1066		if_printf(sc->hn_ifp, "RSS disable failed\n");
1067		return (error);
1068	}
1069
1070	/*
1071	 * Reenable the RSS w/ the updated RSS key or indirect
1072	 * table.
1073	 */
1074	if (bootverbose)
1075		if_printf(sc->hn_ifp, "reconfig RSS\n");
1076	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1077	if (error) {
1078		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1079		return (error);
1080	}
1081	return (0);
1082}
1083
1084static void
1085hn_rss_ind_fixup(struct hn_softc *sc)
1086{
1087	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1088	int i, nchan;
1089
1090	nchan = sc->hn_rx_ring_inuse;
1091	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1092
1093	/*
1094	 * Check indirect table to make sure that all channels in it
1095	 * can be used.
1096	 */
1097	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1098		if (rss->rss_ind[i] >= nchan) {
1099			if_printf(sc->hn_ifp,
1100			    "RSS indirect table %d fixup: %u -> %d\n",
1101			    i, rss->rss_ind[i], nchan - 1);
1102			rss->rss_ind[i] = nchan - 1;
1103		}
1104	}
1105}
1106
1107static int
1108hn_ifmedia_upd(struct ifnet *ifp __unused)
1109{
1110
1111	return EOPNOTSUPP;
1112}
1113
1114static void
1115hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1116{
1117	struct hn_softc *sc = ifp->if_softc;
1118
1119	ifmr->ifm_status = IFM_AVALID;
1120	ifmr->ifm_active = IFM_ETHER;
1121
1122	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1123		ifmr->ifm_active |= IFM_NONE;
1124		return;
1125	}
1126	ifmr->ifm_status |= IFM_ACTIVE;
1127	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1128}
1129
1130static void
1131hn_rxvf_set_task(void *xarg, int pending __unused)
1132{
1133	struct hn_rxvf_setarg *arg = xarg;
1134
1135	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1136}
1137
1138static void
1139hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1140{
1141	struct hn_rx_ring *rxr;
1142	struct hn_rxvf_setarg arg;
1143	struct task task;
1144	int i;
1145
1146	HN_LOCK_ASSERT(sc);
1147
1148	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1149
1150	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1151		rxr = &sc->hn_rx_ring[i];
1152
1153		if (i < sc->hn_rx_ring_inuse) {
1154			arg.rxr = rxr;
1155			arg.vf_ifp = vf_ifp;
1156			vmbus_chan_run_task(rxr->hn_chan, &task);
1157		} else {
1158			rxr->hn_rxvf_ifp = vf_ifp;
1159		}
1160	}
1161}
1162
1163static bool
1164hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1165{
1166	const struct ifnet *hn_ifp;
1167
1168	hn_ifp = sc->hn_ifp;
1169
1170	if (ifp == hn_ifp)
1171		return (false);
1172
1173	if (ifp->if_alloctype != IFT_ETHER)
1174		return (false);
1175
1176	/* Ignore lagg/vlan interfaces */
1177	if (strcmp(ifp->if_dname, "lagg") == 0 ||
1178	    strcmp(ifp->if_dname, "vlan") == 0)
1179		return (false);
1180
1181	/*
1182	 * During detach events ifp->if_addr might be NULL.
1183	 * Make sure the bcmp() below doesn't panic on that:
1184	 */
1185	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1186		return (false);
1187
1188	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1189		return (false);
1190
1191	return (true);
1192}
1193
1194static void
1195hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1196{
1197	struct ifnet *hn_ifp;
1198
1199	HN_LOCK(sc);
1200
1201	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1202		goto out;
1203
1204	if (!hn_ismyvf(sc, ifp))
1205		goto out;
1206	hn_ifp = sc->hn_ifp;
1207
1208	if (rxvf) {
1209		if (sc->hn_flags & HN_FLAG_RXVF)
1210			goto out;
1211
1212		sc->hn_flags |= HN_FLAG_RXVF;
1213		hn_rxfilter_config(sc);
1214	} else {
1215		if (!(sc->hn_flags & HN_FLAG_RXVF))
1216			goto out;
1217
1218		sc->hn_flags &= ~HN_FLAG_RXVF;
1219		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1220			hn_rxfilter_config(sc);
1221		else
1222			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1223	}
1224
1225	hn_nvs_set_datapath(sc,
1226	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1227
1228	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1229
1230	if (rxvf) {
1231		hn_vf_rss_fixup(sc, true);
1232		hn_suspend_mgmt(sc);
1233		sc->hn_link_flags &=
1234		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1235		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1236	} else {
1237		hn_vf_rss_restore(sc);
1238		hn_resume_mgmt(sc);
1239	}
1240
1241	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1242	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1243
1244	if (bootverbose) {
1245		if_printf(hn_ifp, "datapath is switched %s %s\n",
1246		    rxvf ? "to" : "from", ifp->if_xname);
1247	}
1248out:
1249	HN_UNLOCK(sc);
1250}
1251
1252static void
1253hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1254{
1255
1256	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1257		return;
1258	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1259}
1260
1261static void
1262hn_ifaddr_event(void *arg, struct ifnet *ifp)
1263{
1264
1265	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1266}
1267
1268static int
1269hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1270{
1271	struct ifnet *ifp, *vf_ifp;
1272	uint64_t tmp;
1273	int error;
1274
1275	HN_LOCK_ASSERT(sc);
1276	ifp = sc->hn_ifp;
1277	vf_ifp = sc->hn_vf_ifp;
1278
1279	/*
1280	 * Fix up requested capabilities w/ supported capabilities,
1281	 * since the supported capabilities could have been changed.
1282	 */
1283	ifr->ifr_reqcap &= ifp->if_capabilities;
1284	/* Pass SIOCSIFCAP to VF. */
1285	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
1286
1287	/*
1288	 * NOTE:
1289	 * The error will be propagated to the callers, however, it
1290	 * is _not_ useful here.
1291	 */
1292
1293	/*
1294	 * Merge VF's enabled capabilities.
1295	 */
1296	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1297
1298	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1299	if (ifp->if_capenable & IFCAP_TXCSUM)
1300		ifp->if_hwassist |= tmp;
1301	else
1302		ifp->if_hwassist &= ~tmp;
1303
1304	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1305	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1306		ifp->if_hwassist |= tmp;
1307	else
1308		ifp->if_hwassist &= ~tmp;
1309
1310	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1311	if (ifp->if_capenable & IFCAP_TSO4)
1312		ifp->if_hwassist |= tmp;
1313	else
1314		ifp->if_hwassist &= ~tmp;
1315
1316	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1317	if (ifp->if_capenable & IFCAP_TSO6)
1318		ifp->if_hwassist |= tmp;
1319	else
1320		ifp->if_hwassist &= ~tmp;
1321
1322	return (error);
1323}
1324
1325static int
1326hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1327{
1328	struct ifnet *vf_ifp;
1329	struct ifreq ifr;
1330
1331	HN_LOCK_ASSERT(sc);
1332	vf_ifp = sc->hn_vf_ifp;
1333
1334	memset(&ifr, 0, sizeof(ifr));
1335	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1336	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1337	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
1338	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1339}
1340
1341static void
1342hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1343{
1344	struct ifnet *ifp = sc->hn_ifp;
1345	int allmulti = 0;
1346
1347	HN_LOCK_ASSERT(sc);
1348
1349	/* XXX vlan(4) style mcast addr maintenance */
1350	if (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs))
1351		allmulti = IFF_ALLMULTI;
1352
1353	/* Always set the VF's if_flags */
1354	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1355}
1356
1357static void
1358hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1359{
1360	struct rm_priotracker pt;
1361	struct ifnet *hn_ifp = NULL;
1362	struct mbuf *mn;
1363
1364	/*
1365	 * XXX racy, if hn(4) ever detached.
1366	 */
1367	rm_rlock(&hn_vfmap_lock, &pt);
1368	if (vf_ifp->if_index < hn_vfmap_size)
1369		hn_ifp = hn_vfmap[vf_ifp->if_index];
1370	rm_runlock(&hn_vfmap_lock, &pt);
1371
1372	if (hn_ifp != NULL) {
1373		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1374			/*
1375			 * Allow tapping on the VF.
1376			 */
1377			ETHER_BPF_MTAP(vf_ifp, mn);
1378
1379			/*
1380			 * Update VF stats.
1381			 */
1382			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1383				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1384				    mn->m_pkthdr.len);
1385			}
1386			/*
1387			 * XXX IFCOUNTER_IMCAST
1388			 * This stat updating is kinda invasive, since it
1389			 * requires two checks on the mbuf: the length check
1390			 * and the ethernet header check.  As of this write,
1391			 * all multicast packets go directly to hn(4), which
1392			 * makes imcast stat updating in the VF a try in vian.
1393			 */
1394
1395			/*
1396			 * Fix up rcvif and increase hn(4)'s ipackets.
1397			 */
1398			mn->m_pkthdr.rcvif = hn_ifp;
1399			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1400		}
1401		/*
1402		 * Go through hn(4)'s if_input.
1403		 */
1404		hn_ifp->if_input(hn_ifp, m);
1405	} else {
1406		/*
1407		 * In the middle of the transition; free this
1408		 * mbuf chain.
1409		 */
1410		while (m != NULL) {
1411			mn = m->m_nextpkt;
1412			m->m_nextpkt = NULL;
1413			m_freem(m);
1414			m = mn;
1415		}
1416	}
1417}
1418
1419static void
1420hn_mtu_change_fixup(struct hn_softc *sc)
1421{
1422	struct ifnet *ifp;
1423
1424	HN_LOCK_ASSERT(sc);
1425	ifp = sc->hn_ifp;
1426
1427	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1428#if __FreeBSD_version >= 1100099
1429	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1430		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1431#endif
1432}
1433
1434static uint32_t
1435hn_rss_type_fromndis(uint32_t rss_hash)
1436{
1437	uint32_t types = 0;
1438
1439	if (rss_hash & NDIS_HASH_IPV4)
1440		types |= RSS_TYPE_IPV4;
1441	if (rss_hash & NDIS_HASH_TCP_IPV4)
1442		types |= RSS_TYPE_TCP_IPV4;
1443	if (rss_hash & NDIS_HASH_IPV6)
1444		types |= RSS_TYPE_IPV6;
1445	if (rss_hash & NDIS_HASH_IPV6_EX)
1446		types |= RSS_TYPE_IPV6_EX;
1447	if (rss_hash & NDIS_HASH_TCP_IPV6)
1448		types |= RSS_TYPE_TCP_IPV6;
1449	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1450		types |= RSS_TYPE_TCP_IPV6_EX;
1451	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1452		types |= RSS_TYPE_UDP_IPV4;
1453	return (types);
1454}
1455
1456static uint32_t
1457hn_rss_type_tondis(uint32_t types)
1458{
1459	uint32_t rss_hash = 0;
1460
1461	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1462	    ("UDP6 and UDP6EX are not supported"));
1463
1464	if (types & RSS_TYPE_IPV4)
1465		rss_hash |= NDIS_HASH_IPV4;
1466	if (types & RSS_TYPE_TCP_IPV4)
1467		rss_hash |= NDIS_HASH_TCP_IPV4;
1468	if (types & RSS_TYPE_IPV6)
1469		rss_hash |= NDIS_HASH_IPV6;
1470	if (types & RSS_TYPE_IPV6_EX)
1471		rss_hash |= NDIS_HASH_IPV6_EX;
1472	if (types & RSS_TYPE_TCP_IPV6)
1473		rss_hash |= NDIS_HASH_TCP_IPV6;
1474	if (types & RSS_TYPE_TCP_IPV6_EX)
1475		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1476	if (types & RSS_TYPE_UDP_IPV4)
1477		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1478	return (rss_hash);
1479}
1480
1481static void
1482hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1483{
1484	int i;
1485
1486	HN_LOCK_ASSERT(sc);
1487
1488	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1489		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1490}
1491
1492static void
1493hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1494{
1495	struct ifnet *ifp, *vf_ifp;
1496	struct ifrsshash ifrh;
1497	struct ifrsskey ifrk;
1498	int error;
1499	uint32_t my_types, diff_types, mbuf_types = 0;
1500
1501	HN_LOCK_ASSERT(sc);
1502	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1503	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1504
1505	if (sc->hn_rx_ring_inuse == 1) {
1506		/* No RSS on synthetic parts; done. */
1507		return;
1508	}
1509	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1510		/* Synthetic parts do not support Toeplitz; done. */
1511		return;
1512	}
1513
1514	ifp = sc->hn_ifp;
1515	vf_ifp = sc->hn_vf_ifp;
1516
1517	/*
1518	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1519	 * supported.
1520	 */
1521	memset(&ifrk, 0, sizeof(ifrk));
1522	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1523	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1524	if (error) {
1525		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1526		    vf_ifp->if_xname, error);
1527		goto done;
1528	}
1529	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1530		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1531		    vf_ifp->if_xname, ifrk.ifrk_func);
1532		goto done;
1533	}
1534	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1535		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1536		    vf_ifp->if_xname, ifrk.ifrk_keylen);
1537		goto done;
1538	}
1539
1540	/*
1541	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1542	 */
1543	memset(&ifrh, 0, sizeof(ifrh));
1544	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1545	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1546	if (error) {
1547		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1548		    vf_ifp->if_xname, error);
1549		goto done;
1550	}
1551	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1552		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1553		    vf_ifp->if_xname, ifrh.ifrh_func);
1554		goto done;
1555	}
1556
1557	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1558	if ((ifrh.ifrh_types & my_types) == 0) {
1559		/* This disables RSS; ignore it then */
1560		if_printf(ifp, "%s intersection of RSS types failed.  "
1561		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
1562		    ifrh.ifrh_types, my_types);
1563		goto done;
1564	}
1565
1566	diff_types = my_types ^ ifrh.ifrh_types;
1567	my_types &= ifrh.ifrh_types;
1568	mbuf_types = my_types;
1569
1570	/*
1571	 * Detect RSS hash value/type confliction.
1572	 *
1573	 * NOTE:
1574	 * We don't disable the hash type, but stop delivery the hash
1575	 * value/type through mbufs on RX path.
1576	 *
1577	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1578	 * hash is delivered with type of TCP_IPV4.  This means if
1579	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1580	 * least to hn_mbuf_hash.  However, given that _all_ of the
1581	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1582	 * here.
1583	 */
1584	if ((my_types & RSS_TYPE_IPV4) &&
1585	    (diff_types & ifrh.ifrh_types &
1586	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1587		/* Conflict; disable IPV4 hash type/value delivery. */
1588		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1589		mbuf_types &= ~RSS_TYPE_IPV4;
1590	}
1591	if ((my_types & RSS_TYPE_IPV6) &&
1592	    (diff_types & ifrh.ifrh_types &
1593	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1594	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1595	      RSS_TYPE_IPV6_EX))) {
1596		/* Conflict; disable IPV6 hash type/value delivery. */
1597		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1598		mbuf_types &= ~RSS_TYPE_IPV6;
1599	}
1600	if ((my_types & RSS_TYPE_IPV6_EX) &&
1601	    (diff_types & ifrh.ifrh_types &
1602	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1603	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1604	      RSS_TYPE_IPV6))) {
1605		/* Conflict; disable IPV6_EX hash type/value delivery. */
1606		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1607		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1608	}
1609	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1610	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1611		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1612		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1613		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1614	}
1615	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1616	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1617		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1618		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1619		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1620	}
1621	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1622	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1623		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1624		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1625		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1626	}
1627	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1628	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1629		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1630		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1631		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1632	}
1633
1634	/*
1635	 * Indirect table does not matter.
1636	 */
1637
1638	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1639	    hn_rss_type_tondis(my_types);
1640	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1641	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1642
1643	if (reconf) {
1644		error = hn_rss_reconfig(sc);
1645		if (error) {
1646			/* XXX roll-back? */
1647			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1648			/* XXX keep going. */
1649		}
1650	}
1651done:
1652	/* Hash deliverability for mbufs. */
1653	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1654}
1655
1656static void
1657hn_vf_rss_restore(struct hn_softc *sc)
1658{
1659
1660	HN_LOCK_ASSERT(sc);
1661	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1662	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1663
1664	if (sc->hn_rx_ring_inuse == 1)
1665		goto done;
1666
1667	/*
1668	 * Restore hash types.  Key does _not_ matter.
1669	 */
1670	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1671		int error;
1672
1673		sc->hn_rss_hash = sc->hn_rss_hcap;
1674		error = hn_rss_reconfig(sc);
1675		if (error) {
1676			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1677			    error);
1678			/* XXX keep going. */
1679		}
1680	}
1681done:
1682	/* Hash deliverability for mbufs. */
1683	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1684}
1685
1686static void
1687hn_xpnt_vf_setready(struct hn_softc *sc)
1688{
1689	struct ifnet *ifp, *vf_ifp;
1690	struct ifreq ifr;
1691
1692	HN_LOCK_ASSERT(sc);
1693	ifp = sc->hn_ifp;
1694	vf_ifp = sc->hn_vf_ifp;
1695
1696	/*
1697	 * Mark the VF ready.
1698	 */
1699	sc->hn_vf_rdytick = 0;
1700
1701	/*
1702	 * Save information for restoration.
1703	 */
1704	sc->hn_saved_caps = ifp->if_capabilities;
1705	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1706	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1707	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1708
1709	/*
1710	 * Intersect supported/enabled capabilities.
1711	 *
1712	 * NOTE:
1713	 * if_hwassist is not changed here.
1714	 */
1715	ifp->if_capabilities &= vf_ifp->if_capabilities;
1716	ifp->if_capenable &= ifp->if_capabilities;
1717
1718	/*
1719	 * Fix TSO settings.
1720	 */
1721	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1722		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1723	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1724		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1725	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1726		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1727
1728	/*
1729	 * Change VF's enabled capabilities.
1730	 */
1731	memset(&ifr, 0, sizeof(ifr));
1732	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1733	ifr.ifr_reqcap = ifp->if_capenable;
1734	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1735
1736	if (ifp->if_mtu != ETHERMTU) {
1737		int error;
1738
1739		/*
1740		 * Change VF's MTU.
1741		 */
1742		memset(&ifr, 0, sizeof(ifr));
1743		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1744		ifr.ifr_mtu = ifp->if_mtu;
1745		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1746		if (error) {
1747			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1748			    vf_ifp->if_xname, ifp->if_mtu);
1749			if (ifp->if_mtu > ETHERMTU) {
1750				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1751
1752				/*
1753				 * XXX
1754				 * No need to adjust the synthetic parts' MTU;
1755				 * failure of the adjustment will cause us
1756				 * infinite headache.
1757				 */
1758				ifp->if_mtu = ETHERMTU;
1759				hn_mtu_change_fixup(sc);
1760			}
1761		}
1762	}
1763}
1764
1765static bool
1766hn_xpnt_vf_isready(struct hn_softc *sc)
1767{
1768
1769	HN_LOCK_ASSERT(sc);
1770
1771	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1772		return (false);
1773
1774	if (sc->hn_vf_rdytick == 0)
1775		return (true);
1776
1777	if (sc->hn_vf_rdytick > ticks)
1778		return (false);
1779
1780	/* Mark VF as ready. */
1781	hn_xpnt_vf_setready(sc);
1782	return (true);
1783}
1784
1785static void
1786hn_xpnt_vf_setenable(struct hn_softc *sc)
1787{
1788	int i;
1789
1790	HN_LOCK_ASSERT(sc);
1791
1792	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1793	rm_wlock(&sc->hn_vf_lock);
1794	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1795	rm_wunlock(&sc->hn_vf_lock);
1796
1797	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1798		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1799}
1800
1801static void
1802hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1803{
1804	int i;
1805
1806	HN_LOCK_ASSERT(sc);
1807
1808	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1809	rm_wlock(&sc->hn_vf_lock);
1810	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1811	if (clear_vf)
1812		sc->hn_vf_ifp = NULL;
1813	rm_wunlock(&sc->hn_vf_lock);
1814
1815	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1816		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1817}
1818
1819static void
1820hn_xpnt_vf_init(struct hn_softc *sc)
1821{
1822	int error;
1823
1824	HN_LOCK_ASSERT(sc);
1825
1826	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1827	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1828
1829	if (bootverbose) {
1830		if_printf(sc->hn_ifp, "try bringing up %s\n",
1831		    sc->hn_vf_ifp->if_xname);
1832	}
1833
1834	/*
1835	 * Bring the VF up.
1836	 */
1837	hn_xpnt_vf_saveifflags(sc);
1838	sc->hn_vf_ifp->if_flags |= IFF_UP;
1839	error = hn_xpnt_vf_iocsetflags(sc);
1840	if (error) {
1841		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1842		    sc->hn_vf_ifp->if_xname, error);
1843		return;
1844	}
1845
1846	/*
1847	 * NOTE:
1848	 * Datapath setting must happen _after_ bringing the VF up.
1849	 */
1850	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1851
1852	/*
1853	 * NOTE:
1854	 * Fixup RSS related bits _after_ the VF is brought up, since
1855	 * many VFs generate their RSS key during initialization.
1856	 */
1857	hn_vf_rss_fixup(sc, true);
1858
1859	/* Mark transparent mode VF as enabled. */
1860	hn_xpnt_vf_setenable(sc);
1861}
1862
1863static void
1864hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1865{
1866	struct hn_softc *sc = xsc;
1867
1868	HN_LOCK(sc);
1869
1870	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1871		goto done;
1872	if (sc->hn_vf_ifp == NULL)
1873		goto done;
1874	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1875		goto done;
1876
1877	if (sc->hn_vf_rdytick != 0) {
1878		/* Mark VF as ready. */
1879		hn_xpnt_vf_setready(sc);
1880	}
1881
1882	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1883		/*
1884		 * Delayed VF initialization.
1885		 */
1886		if (bootverbose) {
1887			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1888			    sc->hn_vf_ifp->if_xname);
1889		}
1890		hn_xpnt_vf_init(sc);
1891	}
1892done:
1893	HN_UNLOCK(sc);
1894}
1895
1896static void
1897hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1898{
1899	struct hn_softc *sc = xsc;
1900
1901	HN_LOCK(sc);
1902
1903	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1904		goto done;
1905
1906	if (!hn_ismyvf(sc, ifp))
1907		goto done;
1908
1909	if (sc->hn_vf_ifp != NULL) {
1910		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1911		    sc->hn_vf_ifp->if_xname);
1912		goto done;
1913	}
1914
1915	if (hn_xpnt_vf && ifp->if_start != NULL) {
1916		/*
1917		 * ifnet.if_start is _not_ supported by transparent
1918		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1919		 */
1920		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1921		    "in transparent VF mode.\n", ifp->if_xname);
1922		goto done;
1923	}
1924
1925	rm_wlock(&hn_vfmap_lock);
1926
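	/*
	 * Record in the global hn_vfmap that this VF's ifindex maps to
	 * this hn(4) interface; grow the map first if the ifindex falls
	 * outside of it.
	 */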
1927	if (ifp->if_index >= hn_vfmap_size) {
1928		struct ifnet **newmap;
1929		int newsize;
1930
1931		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1932		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1933		    M_WAITOK | M_ZERO);
1934
1935		memcpy(newmap, hn_vfmap,
1936		    sizeof(struct ifnet *) * hn_vfmap_size);
1937		free(hn_vfmap, M_DEVBUF);
1938		hn_vfmap = newmap;
1939		hn_vfmap_size = newsize;
1940	}
1941	KASSERT(hn_vfmap[ifp->if_index] == NULL,
1942	    ("%s: ifindex %d was mapped to %s",
1943	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1944	hn_vfmap[ifp->if_index] = sc->hn_ifp;
1945
1946	rm_wunlock(&hn_vfmap_lock);
1947
1948	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1949	rm_wlock(&sc->hn_vf_lock);
1950	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1951	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1952	sc->hn_vf_ifp = ifp;
1953	rm_wunlock(&sc->hn_vf_lock);
1954
1955	if (hn_xpnt_vf) {
1956		int wait_ticks;
1957
1958		/*
1959		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1960		 * Save vf_ifp's current if_input for later restoration.
1961		 */
1962		sc->hn_vf_input = ifp->if_input;
1963		ifp->if_input = hn_xpnt_vf_input;
1964
1965		/*
1966		 * Stop link status management; use the VF's.
1967		 */
1968		hn_suspend_mgmt(sc);
1969
1970		/*
1971		 * Give the VF some time to complete its attach routine.
1972		 */
1973		wait_ticks = hn_xpnt_vf_attwait * hz;
1974		sc->hn_vf_rdytick = ticks + wait_ticks;
1975
1976		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1977		    wait_ticks);
1978	}
1979done:
1980	HN_UNLOCK(sc);
1981}
1982
1983static void
1984hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1985{
1986	struct hn_softc *sc = xsc;
1987
1988	HN_LOCK(sc);
1989
1990	if (sc->hn_vf_ifp == NULL)
1991		goto done;
1992
1993	if (!hn_ismyvf(sc, ifp))
1994		goto done;
1995
1996	if (hn_xpnt_vf) {
1997		/*
1998		 * Make sure that the delayed initialization is not running.
1999		 *
2000		 * NOTE:
2001		 * - This lock _must_ be released, since the hn_vf_init task
2002		 *   will try holding this lock.
2003		 * - It is safe to release this lock here, since the
2004		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
2005		 *
2006		 * XXX racy, if hn(4) ever detached.
2007		 */
2008		HN_UNLOCK(sc);
2009		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
2010		HN_LOCK(sc);
2011
2012		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
2013		    sc->hn_ifp->if_xname));
2014		ifp->if_input = sc->hn_vf_input;
2015		sc->hn_vf_input = NULL;
2016
2017		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
2018		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
2019			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2020
2021		if (sc->hn_vf_rdytick == 0) {
2022			/*
2023			 * The VF was ready; restore some settings.
2024			 */
2025			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
2026			/*
2027			 * NOTE:
2028			 * There is _no_ need to fixup if_capenable and
2029			 * if_hwassist, since the if_capabilities before
2030			 * restoration was an intersection of the VF's
2031			 * if_capabilities and the synthetic device's
2032			 * if_capabilities.
2033			 */
2034			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2035			sc->hn_ifp->if_hw_tsomaxsegcount =
2036			    sc->hn_saved_tsosegcnt;
2037			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2038		}
2039
2040		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2041			/*
2042			 * Restore RSS settings.
2043			 */
2044			hn_vf_rss_restore(sc);
2045
2046			/*
2047			 * Resume link status management, which was suspended
2048			 * by hn_ifnet_attevent().
2049			 */
2050			hn_resume_mgmt(sc);
2051		}
2052	}
2053
2054	/* Mark transparent mode VF as disabled. */
2055	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2056
2057	rm_wlock(&hn_vfmap_lock);
2058
2059	KASSERT(ifp->if_index < hn_vfmap_size,
2060	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2061	if (hn_vfmap[ifp->if_index] != NULL) {
2062		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2063		    ("%s: ifindex %d was mapped to %s",
2064		     ifp->if_xname, ifp->if_index,
2065		     hn_vfmap[ifp->if_index]->if_xname));
2066		hn_vfmap[ifp->if_index] = NULL;
2067	}
2068
2069	rm_wunlock(&hn_vfmap_lock);
2070done:
2071	HN_UNLOCK(sc);
2072}
2073
2074static void
2075hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2076{
2077	struct hn_softc *sc = xsc;
2078
2079	if (sc->hn_vf_ifp == ifp)
2080		if_link_state_change(sc->hn_ifp, link_state);
2081}
2082
2083static int
2084hn_probe(device_t dev)
2085{
2086
2087	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2088		device_set_desc(dev, "Hyper-V Network Interface");
2089		return BUS_PROBE_DEFAULT;
2090	}
2091	return ENXIO;
2092}
2093
2094static int
2095hn_attach(device_t dev)
2096{
2097	struct hn_softc *sc = device_get_softc(dev);
2098	struct sysctl_oid_list *child;
2099	struct sysctl_ctx_list *ctx;
2100	uint8_t eaddr[ETHER_ADDR_LEN];
2101	struct ifnet *ifp = NULL;
2102	int error, ring_cnt, tx_ring_cnt;
2103	uint32_t mtu;
2104
2105	sc->hn_dev = dev;
2106	sc->hn_prichan = vmbus_get_channel(dev);
2107	HN_LOCK_INIT(sc);
2108	rm_init(&sc->hn_vf_lock, "hnvf");
2109	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2110		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2111
2112	/*
2113	 * Initialize these tunables once.
2114	 */
2115	sc->hn_agg_size = hn_tx_agg_size;
2116	sc->hn_agg_pkts = hn_tx_agg_pkts;
2117
2118	/*
2119	 * Setup taskqueue for transmission.
2120	 */
2121	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2122		int i;
2123
2124		sc->hn_tx_taskqs =
2125		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2126		    M_DEVBUF, M_WAITOK);
2127		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2128			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2129			    M_WAITOK, taskqueue_thread_enqueue,
2130			    &sc->hn_tx_taskqs[i]);
2131			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2132			    "%s tx%d", device_get_nameunit(dev), i);
2133		}
2134	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2135		sc->hn_tx_taskqs = hn_tx_taskque;
2136	}
2137
2138	/*
2139	 * Setup taskqueue for management tasks, e.g. link status.
2140	 */
2141	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2142	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2143	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2144	    device_get_nameunit(dev));
2145	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2146	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2147	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2148	    hn_netchg_status_taskfunc, sc);
2149
2150	if (hn_xpnt_vf) {
2151		/*
2152		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
2153		 */
2154		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2155		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2156		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2157		    device_get_nameunit(dev));
2158		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2159		    hn_xpnt_vf_init_taskfunc, sc);
2160	}
2161
2162	/*
2163	 * Allocate ifnet and setup its name earlier, so that if_printf
2164	 * can be used by functions that will be called after
2165	 * ether_ifattach().
2166	 */
2167	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2168	ifp->if_softc = sc;
2169	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2170
2171	/*
2172	 * Initialize ifmedia earlier so that it can be unconditionally
2173	 * destroyed if an error happens later on.
2174	 */
2175	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2176
2177	/*
2178	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2179	 * to use (tx_ring_cnt).
2180	 *
2181	 * NOTE:
2182	 * The # of RX rings to use is the same as the # of channels to use.
2183	 */
2184	ring_cnt = hn_chan_cnt;
2185	if (ring_cnt <= 0) {
2186		/* Default */
2187		ring_cnt = mp_ncpus;
2188		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2189			ring_cnt = HN_RING_CNT_DEF_MAX;
2190	} else if (ring_cnt > mp_ncpus) {
2191		ring_cnt = mp_ncpus;
2192	}
2193#ifdef RSS
2194	if (ring_cnt > rss_getnumbuckets())
2195		ring_cnt = rss_getnumbuckets();
2196#endif
2197
2198	tx_ring_cnt = hn_tx_ring_cnt;
2199	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2200		tx_ring_cnt = ring_cnt;
2201#ifdef HN_IFSTART_SUPPORT
2202	if (hn_use_if_start) {
2203		/* ifnet.if_start only needs one TX ring. */
2204		tx_ring_cnt = 1;
2205	}
2206#endif
2207
2208	/*
2209	 * Set the leader CPU for channels.
2210	 */
2211	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2212
2213	/*
2214	 * Create enough TX/RX rings, even if only a limited number of
2215	 * channels can be allocated.
2216	 */
2217	error = hn_create_tx_data(sc, tx_ring_cnt);
2218	if (error)
2219		goto failed;
2220	error = hn_create_rx_data(sc, ring_cnt);
2221	if (error)
2222		goto failed;
2223
2224	/*
2225	 * Create transaction context for NVS and RNDIS transactions.
2226	 */
2227	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2228	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2229	if (sc->hn_xact == NULL) {
2230		error = ENXIO;
2231		goto failed;
2232	}
2233
2234	/*
2235	 * Install orphan handler for the revocation of this device's
2236	 * primary channel.
2237	 *
2238	 * NOTE:
2239	 * The processing order is critical here:
2240	 * Install the orphan handler, _before_ testing whether this
2241	 * device's primary channel has been revoked or not.
2242	 */
2243	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2244	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2245		error = ENXIO;
2246		goto failed;
2247	}
2248
2249	/*
2250	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2251	 */
2252	error = hn_synth_attach(sc, ETHERMTU);
2253	if (error)
2254		goto failed;
2255
2256	error = hn_rndis_get_eaddr(sc, eaddr);
2257	if (error)
2258		goto failed;
2259
2260	error = hn_rndis_get_mtu(sc, &mtu);
2261	if (error)
2262		mtu = ETHERMTU;
2263	else if (bootverbose)
2264		device_printf(dev, "RNDIS mtu %u\n", mtu);
2265
2266#if __FreeBSD_version >= 1100099
2267	if (sc->hn_rx_ring_inuse > 1) {
2268		/*
2269		 * Reduce TCP segment aggregation limit for multiple
2270		 * RX rings to increase ACK timeliness.
2271		 */
2272		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2273	}
2274#endif
2275
2276	/*
2277	 * Fix up TX/RX settings after the synthetic parts are attached.
2278	 */
2279	hn_fixup_tx_data(sc);
2280	hn_fixup_rx_data(sc);
2281
2282	ctx = device_get_sysctl_ctx(dev);
2283	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2284	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2285	    &sc->hn_nvs_ver, 0, "NVS version");
2286	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2287	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2288	    hn_ndis_version_sysctl, "A", "NDIS version");
2289	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2290	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2291	    hn_caps_sysctl, "A", "capabilities");
2292	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2293	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2294	    hn_hwassist_sysctl, "A", "hwassist");
2295	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2296	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2297	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2298	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2299	    "max # of TSO segments");
2300	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2301	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2302	    "max size of TSO segment");
2303	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2304	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2305	    hn_rxfilter_sysctl, "A", "rxfilter");
2306	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2307	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2308	    hn_rss_hash_sysctl, "A", "RSS hash");
2309	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2310	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2311	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2312	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2313	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2314	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2315	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2316	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2317#ifndef RSS
2318	/*
2319	 * Don't allow RSS key/indirect table changes if RSS is defined.
2320	 */
2321	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2322	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2323	    hn_rss_key_sysctl, "IU", "RSS key");
2324	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2325	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2326	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2327#endif
2328	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2329	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2330	    "RNDIS offered packet transmission aggregation size limit");
2331	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2332	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2333	    "RNDIS offered packet transmission aggregation count limit");
2334	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2335	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2336	    "RNDIS packet transmission aggregation alignment");
2337	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2338	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2339	    hn_txagg_size_sysctl, "I",
2340	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2341	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2342	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2343	    hn_txagg_pkts_sysctl, "I",
2344	    "Packet transmission aggregation packets, "
2345	    "0 -- disable, -1 -- auto");
2346	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2347	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2348	    hn_polling_sysctl, "I",
2349	    "Polling frequency: [100,1000000], 0 disable polling");
2350	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2351	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2352	    hn_vf_sysctl, "A", "Virtual Function's name");
2353	if (!hn_xpnt_vf) {
2354		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2355		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2356		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2357	} else {
2358		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2359		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2360		    hn_xpnt_vf_enabled_sysctl, "I",
2361		    "Transparent VF enabled");
2362		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2363		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2364		    hn_xpnt_vf_accbpf_sysctl, "I",
2365		    "Accurate BPF for transparent VF");
2366	}
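	/*
	 * The sysctl nodes above live under this device's sysctl tree,
	 * so from userland they should be reachable as, e.g.,
	 * "sysctl dev.hn.0.caps" (unit 0 used purely as an example).
	 */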
2367
2368	/*
2369	 * Setup the ifmedia, which has been initialized earlier.
2370	 */
2371	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2372	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2373	/* XXX ifmedia_set really should do this for us */
2374	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2375
2376	/*
2377	 * Setup the ifnet for this interface.
2378	 */
2379
2380	ifp->if_baudrate = IF_Gbps(10);
2381	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2382	ifp->if_ioctl = hn_ioctl;
2383	ifp->if_init = hn_init;
2384#ifdef HN_IFSTART_SUPPORT
2385	if (hn_use_if_start) {
2386		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2387
2388		ifp->if_start = hn_start;
2389		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2390		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2391		IFQ_SET_READY(&ifp->if_snd);
2392	} else
2393#endif
2394	{
2395		ifp->if_transmit = hn_transmit;
2396		ifp->if_qflush = hn_xmit_qflush;
2397	}
2398
2399	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2400#ifdef foo
2401	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2402	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2403#endif
2404	if (sc->hn_caps & HN_CAP_VLAN) {
2405		/* XXX not sure about VLAN_MTU. */
2406		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2407	}
2408
2409	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2410	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2411		ifp->if_capabilities |= IFCAP_TXCSUM;
2412	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2413		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2414	if (sc->hn_caps & HN_CAP_TSO4) {
2415		ifp->if_capabilities |= IFCAP_TSO4;
2416		ifp->if_hwassist |= CSUM_IP_TSO;
2417	}
2418	if (sc->hn_caps & HN_CAP_TSO6) {
2419		ifp->if_capabilities |= IFCAP_TSO6;
2420		ifp->if_hwassist |= CSUM_IP6_TSO;
2421	}
2422
2423	/* Enable all available capabilities by default. */
2424	ifp->if_capenable = ifp->if_capabilities;
2425
2426	/*
2427	 * Disable IPv6 TSO and TXCSUM by default; they can still
2428	 * be enabled through SIOCSIFCAP.
2429	 */
2430	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2431	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2432
2433	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2434		/*
2435		 * Lock hn_set_tso_maxsize() to simplify its
2436		 * internal logic.
2437		 */
2438		HN_LOCK(sc);
2439		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2440		HN_UNLOCK(sc);
2441		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2442		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2443	}
2444
2445	ether_ifattach(ifp, eaddr);
2446
2447	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2448		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2449		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2450	}
2451	if (mtu < ETHERMTU) {
2452		if_printf(ifp, "fixup mtu %u -> %u\n", ifp->if_mtu, mtu);
2453		ifp->if_mtu = mtu;
2454	}
2455
2456	/* Inform the upper layer about the long frame support. */
2457	ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2458
2459	/*
2460	 * Kick off link status check.
2461	 */
2462	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2463	hn_update_link_status(sc);
2464
2465	if (!hn_xpnt_vf) {
2466		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2467		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2468		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2469		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2470	} else {
2471		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2472		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2473	}
2474
2475	/*
2476	 * NOTE:
2477	 * Subscribe the ether_ifattach event, instead of the ifnet_arrival
2478	 * event, since the interface's LLADDR is needed; the LLADDR is not
2479	 * available yet when the ifnet_arrival event is triggered.
2480	 */
2481	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2482	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2483	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2484	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2485
2486	return (0);
2487failed:
2488	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2489		hn_synth_detach(sc);
2490	hn_detach(dev);
2491	return (error);
2492}
2493
2494static int
2495hn_detach(device_t dev)
2496{
2497	struct hn_softc *sc = device_get_softc(dev);
2498	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2499
2500	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2501		/*
2502		 * In case the vmbus missed the orphan handler
2503		 * installation.
2504		 */
2505		vmbus_xact_ctx_orphan(sc->hn_xact);
2506	}
2507
2508	if (sc->hn_ifaddr_evthand != NULL)
2509		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2510	if (sc->hn_ifnet_evthand != NULL)
2511		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2512	if (sc->hn_ifnet_atthand != NULL) {
2513		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2514		    sc->hn_ifnet_atthand);
2515	}
2516	if (sc->hn_ifnet_dethand != NULL) {
2517		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2518		    sc->hn_ifnet_dethand);
2519	}
2520	if (sc->hn_ifnet_lnkhand != NULL)
2521		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2522
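	/*
	 * If a VF is still attached, run the departure handling for it
	 * now, since this hn(4) instance is going away.  Snapshot
	 * hn_vf_ifp first so hn_ifnet_detevent() is called with a
	 * stable pointer.
	 */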
2523	vf_ifp = sc->hn_vf_ifp;
2524	__compiler_membar();
2525	if (vf_ifp != NULL)
2526		hn_ifnet_detevent(sc, vf_ifp);
2527
2528	if (device_is_attached(dev)) {
2529		HN_LOCK(sc);
2530		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2531			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2532				hn_stop(sc, true);
2533			/*
2534			 * NOTE:
2535			 * hn_stop() only suspends the data path, so management
2536			 * tasks have to be suspended manually here.
2537			 */
2538			hn_suspend_mgmt(sc);
2539			hn_synth_detach(sc);
2540		}
2541		HN_UNLOCK(sc);
2542		ether_ifdetach(ifp);
2543	}
2544
2545	ifmedia_removeall(&sc->hn_media);
2546	hn_destroy_rx_data(sc);
2547	hn_destroy_tx_data(sc);
2548
2549	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2550		int i;
2551
2552		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2553			taskqueue_free(sc->hn_tx_taskqs[i]);
2554		free(sc->hn_tx_taskqs, M_DEVBUF);
2555	}
2556	taskqueue_free(sc->hn_mgmt_taskq0);
2557	if (sc->hn_vf_taskq != NULL)
2558		taskqueue_free(sc->hn_vf_taskq);
2559
2560	if (sc->hn_xact != NULL) {
2561		/*
2562		 * Uninstall the orphan handler _before_ the xact is
2563		 * destructed.
2564		 */
2565		vmbus_chan_unset_orphan(sc->hn_prichan);
2566		vmbus_xact_ctx_destroy(sc->hn_xact);
2567	}
2568
2569	if_free(ifp);
2570
2571	HN_LOCK_DESTROY(sc);
2572	rm_destroy(&sc->hn_vf_lock);
2573	return (0);
2574}
2575
2576static int
2577hn_shutdown(device_t dev)
2578{
2579
2580	return (0);
2581}
2582
2583static void
2584hn_link_status(struct hn_softc *sc)
2585{
2586	uint32_t link_status;
2587	int error;
2588
2589	error = hn_rndis_get_linkstatus(sc, &link_status);
2590	if (error) {
2591		/* XXX what to do? */
2592		return;
2593	}
2594
2595	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2596		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2597	else
2598		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2599	if_link_state_change(sc->hn_ifp,
2600	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2601	    LINK_STATE_UP : LINK_STATE_DOWN);
2602}
2603
2604static void
2605hn_link_taskfunc(void *xsc, int pending __unused)
2606{
2607	struct hn_softc *sc = xsc;
2608
2609	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2610		return;
2611	hn_link_status(sc);
2612}
2613
2614static void
2615hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2616{
2617	struct hn_softc *sc = xsc;
2618
2619	/* Prevent any link status checks from running. */
2620	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2621
2622	/*
2623	 * Fake up a [link down --> link up] state change; a 5 second
2624	 * delay is used, which closely simulates the miibus reaction
2625	 * to a link down event.
2626	 */
2627	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2628	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2629	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2630	    &sc->hn_netchg_status, 5 * hz);
2631}
2632
2633static void
2634hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2635{
2636	struct hn_softc *sc = xsc;
2637
2638	/* Re-allow link status checks. */
2639	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2640	hn_link_status(sc);
2641}
2642
2643static void
2644hn_update_link_status(struct hn_softc *sc)
2645{
2646
2647	if (sc->hn_mgmt_taskq != NULL)
2648		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2649}
2650
2651static void
2652hn_change_network(struct hn_softc *sc)
2653{
2654
2655	if (sc->hn_mgmt_taskq != NULL)
2656		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2657}
2658
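/*
 * Load the mbuf chain into txd's DMA map.  If the chain has too many
 * segments (EFBIG), collapse it to at most HN_TX_DATA_SEGCNT_MAX
 * segments and retry the load once.
 */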
2659static __inline int
2660hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2661    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2662{
2663	struct mbuf *m = *m_head;
2664	int error;
2665
2666	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2667
2668	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2669	    m, segs, nsegs, BUS_DMA_NOWAIT);
2670	if (error == EFBIG) {
2671		struct mbuf *m_new;
2672
2673		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2674		if (m_new == NULL)
2675			return ENOBUFS;
2676		else
2677			*m_head = m = m_new;
2678		txr->hn_tx_collapsed++;
2679
2680		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2681		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2682	}
2683	if (!error) {
2684		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2685		    BUS_DMASYNC_PREWRITE);
2686		txd->flags |= HN_TXD_FLAG_DMAMAP;
2687	}
2688	return error;
2689}
2690
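/*
 * Drop one reference on txd.  When the last reference goes away, free
 * any txdescs aggregated onto it, release its chimney buffer or DMA
 * map, free the attached mbuf and return the txdesc to the free list.
 * Returns 1 if the txdesc was actually freed, 0 otherwise.
 */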
2691static __inline int
2692hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2693{
2694
2695	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2696	    ("put an onlist txd %#x", txd->flags));
2697	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2698	    ("put an onagg txd %#x", txd->flags));
2699
2700	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2701	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2702		return 0;
2703
2704	if (!STAILQ_EMPTY(&txd->agg_list)) {
2705		struct hn_txdesc *tmp_txd;
2706
2707		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2708			int freed;
2709
2710			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2711			    ("recursive aggregation on aggregated txdesc"));
2712			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2713			    ("not aggregated txdesc"));
2714			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2715			    ("aggregated txdesc uses dmamap"));
2716			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2717			    ("aggregated txdesc consumes "
2718			     "chimney sending buffer"));
2719			KASSERT(tmp_txd->chim_size == 0,
2720			    ("aggregated txdesc has non-zero "
2721			     "chimney sending size"));
2722
2723			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2724			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2725			freed = hn_txdesc_put(txr, tmp_txd);
2726			KASSERT(freed, ("failed to free aggregated txdesc"));
2727		}
2728	}
2729
2730	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2731		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2732		    ("chim txd uses dmamap"));
2733		hn_chim_free(txr->hn_sc, txd->chim_index);
2734		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2735		txd->chim_size = 0;
2736	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2737		bus_dmamap_sync(txr->hn_tx_data_dtag,
2738		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2739		bus_dmamap_unload(txr->hn_tx_data_dtag,
2740		    txd->data_dmap);
2741		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2742	}
2743
2744	if (txd->m != NULL) {
2745		m_freem(txd->m);
2746		txd->m = NULL;
2747	}
2748
2749	txd->flags |= HN_TXD_FLAG_ONLIST;
2750#ifndef HN_USE_TXDESC_BUFRING
2751	mtx_lock_spin(&txr->hn_txlist_spin);
2752	KASSERT(txr->hn_txdesc_avail >= 0 &&
2753	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2754	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2755	txr->hn_txdesc_avail++;
2756	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2757	mtx_unlock_spin(&txr->hn_txlist_spin);
2758#else	/* HN_USE_TXDESC_BUFRING */
2759#ifdef HN_DEBUG
2760	atomic_add_int(&txr->hn_txdesc_avail, 1);
2761#endif
2762	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2763#endif	/* !HN_USE_TXDESC_BUFRING */
2764
2765	return 1;
2766}
2767
2768static __inline struct hn_txdesc *
2769hn_txdesc_get(struct hn_tx_ring *txr)
2770{
2771	struct hn_txdesc *txd;
2772
2773#ifndef HN_USE_TXDESC_BUFRING
2774	mtx_lock_spin(&txr->hn_txlist_spin);
2775	txd = SLIST_FIRST(&txr->hn_txlist);
2776	if (txd != NULL) {
2777		KASSERT(txr->hn_txdesc_avail > 0,
2778		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2779		txr->hn_txdesc_avail--;
2780		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2781	}
2782	mtx_unlock_spin(&txr->hn_txlist_spin);
2783#else
2784	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2785#endif
2786
2787	if (txd != NULL) {
2788#ifdef HN_USE_TXDESC_BUFRING
2789#ifdef HN_DEBUG
2790		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2791#endif
2792#endif	/* HN_USE_TXDESC_BUFRING */
2793		KASSERT(txd->m == NULL && txd->refs == 0 &&
2794		    STAILQ_EMPTY(&txd->agg_list) &&
2795		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2796		    txd->chim_size == 0 &&
2797		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2798		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2799		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2800		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2801		txd->refs = 1;
2802	}
2803	return txd;
2804}
2805
2806static __inline void
2807hn_txdesc_hold(struct hn_txdesc *txd)
2808{
2809
2810	/* 0->1 transition will never work */
2811	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2812	atomic_add_int(&txd->refs, 1);
2813}
2814
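/*
 * Chain txd onto agg_txd's aggregation list; the chained txdesc will
 * be released later by hn_txdesc_put() on the aggregating txdesc.
 */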
2815static __inline void
2816hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2817{
2818
2819	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2820	    ("recursive aggregation on aggregating txdesc"));
2821
2822	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2823	    ("already aggregated"));
2824	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2825	    ("recursive aggregation on to-be-aggregated txdesc"));
2826
2827	txd->flags |= HN_TXD_FLAG_ONAGG;
2828	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2829}
2830
2831static bool
2832hn_tx_ring_pending(struct hn_tx_ring *txr)
2833{
2834	bool pending = false;
2835
2836#ifndef HN_USE_TXDESC_BUFRING
2837	mtx_lock_spin(&txr->hn_txlist_spin);
2838	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2839		pending = true;
2840	mtx_unlock_spin(&txr->hn_txlist_spin);
2841#else
2842	if (!buf_ring_full(txr->hn_txdesc_br))
2843		pending = true;
2844#endif
2845	return (pending);
2846}
2847
2848static __inline void
2849hn_txeof(struct hn_tx_ring *txr)
2850{
2851	txr->hn_has_txeof = 0;
2852	txr->hn_txeof(txr);
2853}
2854
2855static void
2856hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2857    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2858{
2859	struct hn_txdesc *txd = sndc->hn_cbarg;
2860	struct hn_tx_ring *txr;
2861
2862	txr = txd->txr;
2863	KASSERT(txr->hn_chan == chan,
2864	    ("channel mismatch, on chan%u, should be chan%u",
2865	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2866
2867	txr->hn_has_txeof = 1;
2868	hn_txdesc_put(txr, txd);
2869
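	/*
	 * Run the TX completion path early once enough sends have been
	 * completed, but only if the ring is marked oactive, i.e.
	 * transmission is currently throttled.
	 */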
2870	++txr->hn_txdone_cnt;
2871	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2872		txr->hn_txdone_cnt = 0;
2873		if (txr->hn_oactive)
2874			hn_txeof(txr);
2875	}
2876}
2877
2878static void
2879hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2880{
2881#if defined(INET) || defined(INET6)
2882	tcp_lro_flush_all(&rxr->hn_lro);
2883#endif
2884
2885	/*
2886	 * NOTE:
2887	 * 'txr' could be NULL, if multiple channels and
2888	 * ifnet.if_start method are enabled.
2889	 */
2890	if (txr == NULL || !txr->hn_has_txeof)
2891		return;
2892
2893	txr->hn_txdone_cnt = 0;
2894	hn_txeof(txr);
2895}
2896
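/*
 * RNDIS packet message offsets are relative to the rm_dataoffset
 * field, not to the start of the message; convert an offset that was
 * counted from the start of the message accordingly.
 */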
2897static __inline uint32_t
2898hn_rndis_pktmsg_offset(uint32_t ofs)
2899{
2900
2901	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2902	    ("invalid RNDIS packet msg offset %u", ofs));
2903	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2904}
2905
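/*
 * Reserve space for one per-packet-info record at the tail of the
 * RNDIS packet message's pktinfo area and return a pointer to the
 * record's data, which the caller fills in.
 */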
2906static __inline void *
2907hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2908    size_t pi_dlen, uint32_t pi_type)
2909{
2910	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2911	struct rndis_pktinfo *pi;
2912
2913	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2914	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2915
2916	/*
2917	 * Per-packet-info does not move; it only grows.
2918	 *
2919	 * NOTE:
2920	 * rm_pktinfooffset in this phase counts from the beginning
2921	 * of rndis_packet_msg.
2922	 */
2923	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2924	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2925	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2926	    pkt->rm_pktinfolen);
2927	pkt->rm_pktinfolen += pi_size;
2928
2929	pi->rm_size = pi_size;
2930	pi->rm_type = pi_type;
2931	pi->rm_internal = 0;
2932	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2933
2934	return (pi->rm_data);
2935}
2936
2937static __inline int
2938hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2939{
2940	struct hn_txdesc *txd;
2941	struct mbuf *m;
2942	int error, pkts;
2943
2944	txd = txr->hn_agg_txd;
2945	KASSERT(txd != NULL, ("no aggregate txdesc"));
2946
2947	/*
2948	 * Since hn_txpkt() will reset this temporary stat, save
2949	 * it now, so that oerrors can be updated properly, if
2950	 * hn_txpkt() ever fails.
2951	 */
2952	pkts = txr->hn_stat_pkts;
2953
2954	/*
2955	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2956	 * failure, save it for later freeing, if hn_txpkt() ever
2957	 * fails.
2958	 */
2959	m = txd->m;
2960	error = hn_txpkt(ifp, txr, txd);
2961	if (__predict_false(error)) {
2962		/* txd is freed, but m is not. */
2963		m_freem(m);
2964
2965		txr->hn_flush_failed++;
2966		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2967	}
2968
2969	/* Reset all aggregation states. */
2970	txr->hn_agg_txd = NULL;
2971	txr->hn_agg_szleft = 0;
2972	txr->hn_agg_pktleft = 0;
2973	txr->hn_agg_prevpkt = NULL;
2974
2975	return (error);
2976}
2977
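/*
 * Return a pointer into a chimney sending buffer where the RNDIS
 * packet for txd should be built: either inside the currently
 * aggregating txdesc's buffer, or in a freshly allocated one.  Return
 * NULL if no chimney buffer can be used, in which case the caller
 * falls back to the GPA (sglist) path.
 */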
2978static void *
2979hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2980    int pktsize)
2981{
2982	void *chim;
2983
2984	if (txr->hn_agg_txd != NULL) {
2985		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2986			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2987			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2988			int olen;
2989
2990			/*
2991			 * Update the previous RNDIS packet's total length;
2992			 * it can be increased due to the mandatory alignment
2993			 * padding for this RNDIS packet.  And update the
2994			 * aggregating txdesc's chimney sending buffer size
2995			 * accordingly.
2996			 *
2997			 * XXX
2998			 * Zero-out the padding, as required by the RNDIS spec.
2999			 */
3000			olen = pkt->rm_len;
3001			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
3002			agg_txd->chim_size += pkt->rm_len - olen;
3003
3004			/* Link this txdesc to the parent. */
3005			hn_txdesc_agg(agg_txd, txd);
3006
3007			chim = (uint8_t *)pkt + pkt->rm_len;
3008			/* Save the current packet for later fixup. */
3009			txr->hn_agg_prevpkt = chim;
3010
3011			txr->hn_agg_pktleft--;
3012			txr->hn_agg_szleft -= pktsize;
3013			if (txr->hn_agg_szleft <=
3014			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3015				/*
3016				 * Probably can't aggregate more packets,
3017				 * flush this aggregating txdesc proactively.
3018				 */
3019				txr->hn_agg_pktleft = 0;
3020			}
3021			/* Done! */
3022			return (chim);
3023		}
3024		hn_flush_txagg(ifp, txr);
3025	}
3026	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3027
3028	txr->hn_tx_chimney_tried++;
3029	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3030	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3031		return (NULL);
3032	txr->hn_tx_chimney++;
3033
3034	chim = txr->hn_sc->hn_chim +
3035	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3036
3037	if (txr->hn_agg_pktmax > 1 &&
3038	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3039		txr->hn_agg_txd = txd;
3040		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3041		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3042		txr->hn_agg_prevpkt = chim;
3043	}
3044	return (chim);
3045}
3046
3047/*
3048 * NOTE:
3049 * If this function fails, then both txd and m_head0 will be freed.
3050 */
3051static int
3052hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3053    struct mbuf **m_head0)
3054{
3055	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3056	int error, nsegs, i;
3057	struct mbuf *m_head = *m_head0;
3058	struct rndis_packet_msg *pkt;
3059	uint32_t *pi_data;
3060	void *chim = NULL;
3061	int pkt_hlen, pkt_size;
3062
3063	pkt = txd->rndis_pkt;
3064	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3065	if (pkt_size < txr->hn_chim_size) {
3066		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3067		if (chim != NULL)
3068			pkt = chim;
3069	} else {
3070		if (txr->hn_agg_txd != NULL)
3071			hn_flush_txagg(ifp, txr);
3072	}
3073
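	/*
	 * Build the RNDIS packet message header.  If a chimney buffer
	 * was obtained above, build directly into that buffer;
	 * otherwise build into the txdesc's pre-allocated RNDIS packet
	 * area, which is sent through the GPA (sglist) path below.
	 */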
3074	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3075	pkt->rm_len = m_head->m_pkthdr.len;
3076	pkt->rm_dataoffset = 0;
3077	pkt->rm_datalen = m_head->m_pkthdr.len;
3078	pkt->rm_oobdataoffset = 0;
3079	pkt->rm_oobdatalen = 0;
3080	pkt->rm_oobdataelements = 0;
3081	pkt->rm_pktinfooffset = sizeof(*pkt);
3082	pkt->rm_pktinfolen = 0;
3083	pkt->rm_vchandle = 0;
3084	pkt->rm_reserved = 0;
3085
3086	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3087		/*
3088		 * Set the hash value for this packet.
3089		 */
3090		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3091		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3092
3093		if (M_HASHTYPE_ISHASH(m_head))
3094			/*
3095			 * The flowid field contains the hash value that the
3096			 * host set in the RX queue, if this is an IP forwarding
3097			 * packet.  Set the same hash value so the host can send
3098			 * it out on the CPU it was received on.
3099			 */
3100			*pi_data = m_head->m_pkthdr.flowid;
3101		else
3102			/*
3103			 * Otherwise just put the tx queue index.
3104			 */
3105			*pi_data = txr->hn_tx_idx;
3106	}
3107
3108	if (m_head->m_flags & M_VLANTAG) {
3109		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3110		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3111		*pi_data = NDIS_VLAN_INFO_MAKE(
3112		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3113		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3114		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3115	}
3116
3117	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3118#if defined(INET6) || defined(INET)
3119		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3120		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3121#ifdef INET
3122		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3123			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3124			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3125			    m_head->m_pkthdr.tso_segsz);
3126		}
3127#endif
3128#if defined(INET6) && defined(INET)
3129		else
3130#endif
3131#ifdef INET6
3132		{
3133			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3134			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3135			    m_head->m_pkthdr.tso_segsz);
3136		}
3137#endif
3138#endif	/* INET6 || INET */
3139	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3140		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3141		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3142		if (m_head->m_pkthdr.csum_flags &
3143		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3144			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3145		} else {
3146			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3147			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3148				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3149		}
3150
3151		if (m_head->m_pkthdr.csum_flags &
3152		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3153			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3154			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3155		} else if (m_head->m_pkthdr.csum_flags &
3156		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3157			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3158			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3159		}
3160	}
3161
3162	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3163	/* Fixup RNDIS packet message total length */
3164	pkt->rm_len += pkt_hlen;
3165	/* Convert RNDIS packet message offsets */
3166	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3167	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3168
3169	/*
3170	 * Fast path: Chimney sending.
3171	 */
3172	if (chim != NULL) {
3173		struct hn_txdesc *tgt_txd = txd;
3174
3175		if (txr->hn_agg_txd != NULL) {
3176			tgt_txd = txr->hn_agg_txd;
3177#ifdef INVARIANTS
3178			*m_head0 = NULL;
3179#endif
3180		}
3181
3182		KASSERT(pkt == chim,
3183		    ("RNDIS pkt not in chimney sending buffer"));
3184		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3185		    ("chimney sending buffer is not used"));
3186		tgt_txd->chim_size += pkt->rm_len;
3187
3188		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3189		    ((uint8_t *)chim) + pkt_hlen);
3190
3191		txr->hn_gpa_cnt = 0;
3192		txr->hn_sendpkt = hn_txpkt_chim;
3193		goto done;
3194	}
3195
3196	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3197	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3198	    ("chimney buffer is used"));
3199	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3200
3201	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3202	if (__predict_false(error)) {
3203		int freed;
3204
3205		/*
3206		 * This mbuf is not linked w/ the txd yet, so free it now.
3207		 */
3208		m_freem(m_head);
3209		*m_head0 = NULL;
3210
3211		freed = hn_txdesc_put(txr, txd);
3212		KASSERT(freed != 0,
3213		    ("fail to free txd upon txdma error"));
3214
3215		txr->hn_txdma_failed++;
3216		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3217		return error;
3218	}
3219	*m_head0 = m_head;
3220
3221	/* +1 RNDIS packet message */
3222	txr->hn_gpa_cnt = nsegs + 1;
3223
3224	/* send packet with page buffer */
3225	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3226	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3227	txr->hn_gpa[0].gpa_len = pkt_hlen;
3228
3229	/*
3230	 * Fill the page buffers with mbuf info after the page
3231	 * buffer for RNDIS packet message.
3232	 */
3233	for (i = 0; i < nsegs; ++i) {
3234		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3235
3236		gpa->gpa_page = atop(segs[i].ds_addr);
3237		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3238		gpa->gpa_len = segs[i].ds_len;
3239	}
3240
3241	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3242	txd->chim_size = 0;
3243	txr->hn_sendpkt = hn_txpkt_sglist;
3244done:
3245	txd->m = m_head;
3246
3247	/* Set the completion routine */
3248	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3249
3250	/* Update temporary stats for later use. */
3251	txr->hn_stat_pkts++;
3252	txr->hn_stat_size += m_head->m_pkthdr.len;
3253	if (m_head->m_flags & M_MCAST)
3254		txr->hn_stat_mcasts++;
3255
3256	return 0;
3257}
3258
3259/*
3260 * NOTE:
3261 * If this function fails, then txd will be freed, but the mbuf
3262 * associated w/ the txd will _not_ be freed.
3263 */
3264static int
3265hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3266{
3267	int error, send_failed = 0, has_bpf;
3268
3269again:
3270	has_bpf = bpf_peers_present(ifp->if_bpf);
3271	if (has_bpf) {
3272		/*
3273		 * Make sure that this txd and any aggregated txds are not
3274		 * freed before ETHER_BPF_MTAP.
3275		 */
3276		hn_txdesc_hold(txd);
3277	}
3278	error = txr->hn_sendpkt(txr, txd);
3279	if (!error) {
3280		if (has_bpf) {
3281			const struct hn_txdesc *tmp_txd;
3282
3283			ETHER_BPF_MTAP(ifp, txd->m);
3284			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3285				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3286		}
3287
3288		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3289#ifdef HN_IFSTART_SUPPORT
3290		if (!hn_use_if_start)
3291#endif
3292		{
3293			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3294			    txr->hn_stat_size);
3295			if (txr->hn_stat_mcasts != 0) {
3296				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3297				    txr->hn_stat_mcasts);
3298			}
3299		}
3300		txr->hn_pkts += txr->hn_stat_pkts;
3301		txr->hn_sends++;
3302	}
3303	if (has_bpf)
3304		hn_txdesc_put(txr, txd);
3305
3306	if (__predict_false(error)) {
3307		int freed;
3308
3309		/*
3310		 * This should "really rarely" happen.
3311		 *
3312		 * XXX Too many RX to be acked or too many sideband
3313		 * commands to run?  Ask netvsc_channel_rollup()
3314		 * to kick start later.
3315		 */
3316		txr->hn_has_txeof = 1;
3317		if (!send_failed) {
3318			txr->hn_send_failed++;
3319			send_failed = 1;
3320			/*
3321			 * Try sending again after setting hn_has_txeof,
3322			 * in case we missed the last
3323			 * netvsc_channel_rollup().
3324			 */
3325			goto again;
3326		}
3327		if_printf(ifp, "send failed\n");
3328
3329		/*
3330		 * Caller will perform further processing on the
3331		 * associated mbuf, so don't free it in hn_txdesc_put();
3332		 * only unload it from the DMA map in hn_txdesc_put(),
3333		 * if it was loaded.
3334		 */
3335		txd->m = NULL;
3336		freed = hn_txdesc_put(txr, txd);
3337		KASSERT(freed != 0,
3338		    ("fail to free txd upon send error"));
3339
3340		txr->hn_send_failed++;
3341	}
3342
3343	/* Reset temporary stats, after this sending is done. */
3344	txr->hn_stat_size = 0;
3345	txr->hn_stat_pkts = 0;
3346	txr->hn_stat_mcasts = 0;
3347
3348	return (error);
3349}
3350
3351/*
3352 * Append the specified data to the indicated mbuf chain.
3353 * Extend the mbuf chain if the new data does not fit in
3354 * existing space.
3355 *
3356 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3357 * There should be an equivalent in the kernel mbuf code,
3358 * but there does not appear to be one yet.
3359 *
3360 * Differs from m_append() in that additional mbufs are
3361 * allocated with cluster size MJUMPAGESIZE, and filled
3362 * accordingly.
3363 *
3364 * Return the last mbuf in the chain or NULL if failed to
3365 * allocate new mbuf.
3366 */
3367static struct mbuf *
3368hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3369{
3370	struct mbuf *m, *n;
3371	int remainder, space;
3372
3373	for (m = m0; m->m_next != NULL; m = m->m_next)
3374		;
3375	remainder = len;
3376	space = M_TRAILINGSPACE(m);
3377	if (space > 0) {
3378		/*
3379		 * Copy into available space.
3380		 */
3381		if (space > remainder)
3382			space = remainder;
3383		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3384		m->m_len += space;
3385		cp += space;
3386		remainder -= space;
3387	}
3388	while (remainder > 0) {
3389		/*
3390		 * Allocate a new mbuf; could check space
3391		 * and allocate a cluster instead.
3392		 */
3393		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3394		if (n == NULL)
3395			return NULL;
3396		n->m_len = min(MJUMPAGESIZE, remainder);
3397		bcopy(cp, mtod(n, caddr_t), n->m_len);
3398		cp += n->m_len;
3399		remainder -= n->m_len;
3400		m->m_next = n;
3401		m = n;
3402	}
3403
3404	return m;
3405}
3406
3407#if defined(INET) || defined(INET6)
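/*
 * Queue the mbuf for deferred LRO processing when an LRO mbuf queue
 * is configured (and the kernel supports it); otherwise run LRO on it
 * directly.
 */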
3408static __inline int
3409hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3410{
3411#if __FreeBSD_version >= 1100095
3412	if (hn_lro_mbufq_depth) {
3413		tcp_lro_queue_mbuf(lc, m);
3414		return 0;
3415	}
3416#endif
3417	return tcp_lro_rx(lc, m, 0);
3418}
3419#endif
3420
3421static int
3422hn_rxpkt(struct hn_rx_ring *rxr)
3423{
3424	struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3425	struct mbuf *m_new, *n;
3426	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3427	int hash_type = M_HASHTYPE_NONE;
3428	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3429	int i;
3430
3431	ifp = hn_ifp;
3432	if (rxr->hn_rxvf_ifp != NULL) {
3433		/*
3434		 * Non-transparent mode VF; pretend this packet is from
3435		 * the VF.
3436		 */
3437		ifp = rxr->hn_rxvf_ifp;
3438		is_vf = 1;
3439	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3440		/* Transparent mode VF. */
3441		is_vf = 1;
3442	}
3443
3444	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3445		/*
3446		 * NOTE:
3447		 * See the NOTE of hn_rndis_init_fixat().  This
3448		 * function can be reached immediately after the
3449		 * RNDIS is initialized but before the ifnet is
3450		 * set up on the hn_attach() path; drop the unexpected
3451		 * packets.
3452		 */
3453		return (0);
3454	}
3455
3456	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3457		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3458		return (0);
3459	}
3460
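	/*
	 * A single small fragment fits in a plain mbuf; copy it
	 * directly.  Larger or multi-fragment (coalesced) packets are
	 * copied into a cluster mbuf chain below.
	 */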
3461	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3462		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3463		if (m_new == NULL) {
3464			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3465			return (0);
3466		}
3467		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3468		    rxr->rsc.frag_len[0]);
3469		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3470	} else {
3471		/*
3472		 * Get an mbuf with a cluster.  For packets 2K or less,
3473		 * get a standard 2K cluster.  For anything larger, get a
3474		 * 4K cluster.  Any buffers larger than 4K can cause problems
3475		 * if looped around to the Hyper-V TX channel, so avoid them.
3476		 */
3477		size = MCLBYTES;
3478		if (rxr->rsc.pktlen > MCLBYTES) {
3479			/* 4096 */
3480			size = MJUMPAGESIZE;
3481		}
3482
3483		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3484		if (m_new == NULL) {
3485			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3486			return (0);
3487		}
3488
3489		n = m_new;
3490		for (i = 0; i < rxr->rsc.cnt; i++) {
3491			n = hv_m_append(n, rxr->rsc.frag_len[i],
3492			    rxr->rsc.frag_data[i]);
3493			if (n == NULL) {
3494				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3495				return (0);
3496			} else {
3497				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3498			}
3499		}
3500	}
3501	if (rxr->rsc.pktlen <= MHLEN)
3502		rxr->hn_small_pkts++;
3503
3504	m_new->m_pkthdr.rcvif = ifp;
3505
3506	if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3507		do_csum = 0;
3508
3509	/* receive side checksum offload */
3510	if (rxr->rsc.csum_info != NULL) {
3511		/* IP csum offload */
3512		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3513			m_new->m_pkthdr.csum_flags |=
3514			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3515			rxr->hn_csum_ip++;
3516		}
3517
3518		/* TCP/UDP csum offload */
3519		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3520		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3521			m_new->m_pkthdr.csum_flags |=
3522			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3523			m_new->m_pkthdr.csum_data = 0xffff;
3524			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3525				rxr->hn_csum_tcp++;
3526			else
3527				rxr->hn_csum_udp++;
3528		}
3529
3530		/*
3531		 * XXX
3532		 * As of this writing (Oct 28th, 2016), the host side will turn
3533		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3534		 * the do_lro setting here is actually _not_ accurate.  We
3535		 * depend on the RSS hash type check to reset do_lro.
3536		 */
3537		if ((*(rxr->rsc.csum_info) &
3538		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3539		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3540			do_lro = 1;
3541	} else {
3542		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3543		if (l3proto == ETHERTYPE_IP) {
3544			if (l4proto == IPPROTO_TCP) {
3545				if (do_csum &&
3546				    (rxr->hn_trust_hcsum &
3547				     HN_TRUST_HCSUM_TCP)) {
3548					rxr->hn_csum_trusted++;
3549					m_new->m_pkthdr.csum_flags |=
3550					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3551					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3552					m_new->m_pkthdr.csum_data = 0xffff;
3553				}
3554				do_lro = 1;
3555			} else if (l4proto == IPPROTO_UDP) {
3556				if (do_csum &&
3557				    (rxr->hn_trust_hcsum &
3558				     HN_TRUST_HCSUM_UDP)) {
3559					rxr->hn_csum_trusted++;
3560					m_new->m_pkthdr.csum_flags |=
3561					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3562					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3563					m_new->m_pkthdr.csum_data = 0xffff;
3564				}
3565			} else if (l4proto != IPPROTO_DONE && do_csum &&
3566			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3567				rxr->hn_csum_trusted++;
3568				m_new->m_pkthdr.csum_flags |=
3569				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3570			}
3571		}
3572	}
3573
3574	if (rxr->rsc.vlan_info != NULL) {
3575		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3576		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3577		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3578		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3579		m_new->m_flags |= M_VLANTAG;
3580	}
3581
3582	/*
3583	 * If a VF is activated (transparent/non-transparent mode does not
3584	 * matter here).
3585	 *
3586	 * - Disable LRO
3587	 *
3588	 *   hn(4) will only receive broadcast packets, multicast packets,
3589	 *   TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3590	 *   packet types.
3591	 *
3592	 *   For non-transparent, we definitely _cannot_ enable LRO at
3593	 *   all, since the LRO flush will use hn(4) as the receiving
3594	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3595	 */
3596	if (is_vf)
3597		do_lro = 0;
3598
3599	/*
3600	 * If a VF is activated (transparent/non-transparent mode does not
3601	 * matter here), do _not_ mess with unsupported hash types or
3602	 * functions.
3603	 */
3604	if (rxr->rsc.hash_info != NULL) {
3605		rxr->hn_rss_pkts++;
3606		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3607		if (!is_vf)
3608			hash_type = M_HASHTYPE_OPAQUE_HASH;
3609		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3610		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3611			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3612			    rxr->hn_mbuf_hash);
3613
3614			/*
3615			 * NOTE:
3616			 * do_lro is reset, if the hash types are not TCP
3617			 * related.  See the comment in the above csum_flags
3618			 * setup section.
3619			 */
3620			switch (type) {
3621			case NDIS_HASH_IPV4:
3622				hash_type = M_HASHTYPE_RSS_IPV4;
3623				do_lro = 0;
3624				break;
3625
3626			case NDIS_HASH_TCP_IPV4:
3627				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3628				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3629					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3630
3631					if (is_vf)
3632						def_htype = M_HASHTYPE_NONE;
3633
3634					/*
3635					 * UDP 4-tuple hash is delivered as
3636					 * TCP 4-tuple hash.
3637					 */
3638					if (l3proto == ETHERTYPE_MAX) {
3639						hn_rxpkt_proto(m_new,
3640						    &l3proto, &l4proto);
3641					}
3642					if (l3proto == ETHERTYPE_IP) {
3643						if (l4proto == IPPROTO_UDP &&
3644						    (rxr->hn_mbuf_hash &
3645						     NDIS_HASH_UDP_IPV4_X)) {
3646							hash_type =
3647							M_HASHTYPE_RSS_UDP_IPV4;
3648							do_lro = 0;
3649						} else if (l4proto !=
3650						    IPPROTO_TCP) {
3651							hash_type = def_htype;
3652							do_lro = 0;
3653						}
3654					} else {
3655						hash_type = def_htype;
3656						do_lro = 0;
3657					}
3658				}
3659				break;
3660
3661			case NDIS_HASH_IPV6:
3662				hash_type = M_HASHTYPE_RSS_IPV6;
3663				do_lro = 0;
3664				break;
3665
3666			case NDIS_HASH_IPV6_EX:
3667				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3668				do_lro = 0;
3669				break;
3670
3671			case NDIS_HASH_TCP_IPV6:
3672				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3673				break;
3674
3675			case NDIS_HASH_TCP_IPV6_EX:
3676				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3677				break;
3678			}
3679		}
3680	} else if (!is_vf) {
3681		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3682		hash_type = M_HASHTYPE_OPAQUE;
3683	}
3684	M_HASHTYPE_SET(m_new, hash_type);
3685
3686	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3687	if (hn_ifp != ifp) {
3688		const struct ether_header *eh;
3689
3690		/*
3691		 * Non-transparent mode VF is activated.
3692		 */
3693
3694		/*
3695		 * Allow tapping on hn(4).
3696		 */
3697		ETHER_BPF_MTAP(hn_ifp, m_new);
3698
3699		/*
3700		 * Update hn(4)'s stats.
3701		 */
3702		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3703		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3704		/* Checked at the beginning of this function. */
3705		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3706		eh = mtod(m_new, struct ether_header *);
3707		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3708			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3709	}
3710	rxr->hn_pkts++;
3711
3712	if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3713#if defined(INET) || defined(INET6)
3714		struct lro_ctrl *lro = &rxr->hn_lro;
3715
3716		if (lro->lro_cnt) {
3717			rxr->hn_lro_tried++;
3718			if (hn_lro_rx(lro, m_new) == 0) {
3719				/* DONE! */
3720				return 0;
3721			}
3722		}
3723#endif
3724	}
3725	ifp->if_input(ifp, m_new);
3726
3727	return (0);
3728}
3729
3730static int
3731hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3732{
3733	struct hn_softc *sc = ifp->if_softc;
3734	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3735	struct ifnet *vf_ifp;
3736	int mask, error = 0;
3737	struct ifrsskey *ifrk;
3738	struct ifrsshash *ifrh;
3739	uint32_t mtu;
3740
3741	switch (cmd) {
3742	case SIOCSIFMTU:
3743		if (ifr->ifr_mtu > HN_MTU_MAX) {
3744			error = EINVAL;
3745			break;
3746		}
3747
3748		HN_LOCK(sc);
3749
3750		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3751			HN_UNLOCK(sc);
3752			break;
3753		}
3754
3755		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3756			/* Can't change MTU */
3757			HN_UNLOCK(sc);
3758			error = EOPNOTSUPP;
3759			break;
3760		}
3761
3762		if (ifp->if_mtu == ifr->ifr_mtu) {
3763			HN_UNLOCK(sc);
3764			break;
3765		}
3766
3767		if (hn_xpnt_vf_isready(sc)) {
3768			vf_ifp = sc->hn_vf_ifp;
3769			ifr_vf = *ifr;
3770			strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3771			    sizeof(ifr_vf.ifr_name));
3772			error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3773			    (caddr_t)&ifr_vf);
3774			if (error) {
3775				HN_UNLOCK(sc);
3776				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3777				    vf_ifp->if_xname, ifr->ifr_mtu, error);
3778				break;
3779			}
3780		}
3781
3782		/*
3783		 * Suspend this interface before the synthetic parts
3784		 * are ripped.
3785		 */
3786		hn_suspend(sc);
3787
3788		/*
3789		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3790		 */
3791		hn_synth_detach(sc);
3792
3793		/*
3794		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3795		 * with the new MTU setting.
3796		 */
3797		error = hn_synth_attach(sc, ifr->ifr_mtu);
3798		if (error) {
3799			HN_UNLOCK(sc);
3800			break;
3801		}
3802
3803		error = hn_rndis_get_mtu(sc, &mtu);
3804		if (error)
3805			mtu = ifr->ifr_mtu;
3806		else if (bootverbose)
3807			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3808
3809		/*
3810		 * Commit the requested MTU, after the synthetic parts
3811		 * have been successfully attached.
3812		 */
3813		if (mtu >= ifr->ifr_mtu) {
3814			mtu = ifr->ifr_mtu;
3815		} else {
3816			if_printf(ifp, "fixup mtu %d -> %u\n",
3817			    ifr->ifr_mtu, mtu);
3818		}
3819		ifp->if_mtu = mtu;
3820
3821		/*
3822		 * Synthetic parts' reattach may change the chimney
3823		 * sending size; update it.
3824		 */
3825		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3826			hn_set_chim_size(sc, sc->hn_chim_szmax);
3827
3828		/*
3829		 * Make sure that various parameters based on MTU are
3830		 * still valid, after the MTU change.
3831		 */
3832		hn_mtu_change_fixup(sc);
3833
3834		/*
3835		 * All done!  Resume the interface now.
3836		 */
3837		hn_resume(sc);
3838
3839		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3840		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3841			/*
3842			 * Since we have reattached the NVS part,
3843			 * switch the datapath back to the VF, in case
3844			 * the setting was lost when the NVS was detached.
3845			 */
3846			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3847		}
3848
3849		HN_UNLOCK(sc);
3850		break;
3851
3852	case SIOCSIFFLAGS:
3853		HN_LOCK(sc);
3854
3855		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3856			HN_UNLOCK(sc);
3857			break;
3858		}
3859
3860		if (hn_xpnt_vf_isready(sc))
3861			hn_xpnt_vf_saveifflags(sc);
3862
3863		if (ifp->if_flags & IFF_UP) {
3864			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3865				/*
3866				 * Caller might hold a mutex, e.g.
3867				 * bpf; use busy-wait for the RNDIS
3868				 * reply.
3869				 */
3870				HN_NO_SLEEPING(sc);
3871				hn_rxfilter_config(sc);
3872				HN_SLEEPING_OK(sc);
3873
3874				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3875					error = hn_xpnt_vf_iocsetflags(sc);
3876			} else {
3877				hn_init_locked(sc);
3878			}
3879		} else {
3880			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3881				hn_stop(sc, false);
3882		}
3883		sc->hn_if_flags = ifp->if_flags;
3884
3885		HN_UNLOCK(sc);
3886		break;
3887
3888	case SIOCSIFCAP:
3889		HN_LOCK(sc);
3890
3891		if (hn_xpnt_vf_isready(sc)) {
3892			ifr_vf = *ifr;
3893			strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3894			    sizeof(ifr_vf.ifr_name));
3895			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3896			HN_UNLOCK(sc);
3897			break;
3898		}
3899
3900		/*
3901		 * Fix up requested capabilities w/ supported capabilities,
3902		 * since the supported capabilities could have been changed.
3903		 */
3904		mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3905		    ifp->if_capenable;
3906
3907		if (mask & IFCAP_TXCSUM) {
3908			ifp->if_capenable ^= IFCAP_TXCSUM;
3909			if (ifp->if_capenable & IFCAP_TXCSUM)
3910				ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3911			else
3912				ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3913		}
3914		if (mask & IFCAP_TXCSUM_IPV6) {
3915			ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3916			if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3917				ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3918			else
3919				ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3920		}
3921
3922		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3923		if (mask & IFCAP_RXCSUM)
3924			ifp->if_capenable ^= IFCAP_RXCSUM;
3925#ifdef foo
3926		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3927		if (mask & IFCAP_RXCSUM_IPV6)
3928			ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3929#endif
3930
3931		if (mask & IFCAP_LRO)
3932			ifp->if_capenable ^= IFCAP_LRO;
3933
3934		if (mask & IFCAP_TSO4) {
3935			ifp->if_capenable ^= IFCAP_TSO4;
3936			if (ifp->if_capenable & IFCAP_TSO4)
3937				ifp->if_hwassist |= CSUM_IP_TSO;
3938			else
3939				ifp->if_hwassist &= ~CSUM_IP_TSO;
3940		}
3941		if (mask & IFCAP_TSO6) {
3942			ifp->if_capenable ^= IFCAP_TSO6;
3943			if (ifp->if_capenable & IFCAP_TSO6)
3944				ifp->if_hwassist |= CSUM_IP6_TSO;
3945			else
3946				ifp->if_hwassist &= ~CSUM_IP6_TSO;
3947		}
3948
3949		HN_UNLOCK(sc);
3950		break;
3951
3952	case SIOCADDMULTI:
3953	case SIOCDELMULTI:
3954		HN_LOCK(sc);
3955
3956		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3957			HN_UNLOCK(sc);
3958			break;
3959		}
3960		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3961			/*
3962			 * Multicast uses mutex; use busy-wait for
3963			 * the RNDIS reply.
3964			 */
3965			HN_NO_SLEEPING(sc);
3966			hn_rxfilter_config(sc);
3967			HN_SLEEPING_OK(sc);
3968		}
3969
3970		/* XXX vlan(4) style mcast addr maintenance */
3971		if (hn_xpnt_vf_isready(sc)) {
3972			int old_if_flags;
3973
3974			old_if_flags = sc->hn_vf_ifp->if_flags;
3975			hn_xpnt_vf_saveifflags(sc);
3976
3977			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3978			    ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3979			     IFF_ALLMULTI))
3980				error = hn_xpnt_vf_iocsetflags(sc);
3981		}
3982
3983		HN_UNLOCK(sc);
3984		break;
3985
3986	case SIOCSIFMEDIA:
3987	case SIOCGIFMEDIA:
3988		HN_LOCK(sc);
3989		if (hn_xpnt_vf_isready(sc)) {
3990			/*
3991			 * SIOCGIFMEDIA expects ifmediareq, so don't
3992			 * create and pass ifr_vf to the VF here; just
3993			 * replace the ifr_name.
3994			 */
3995			vf_ifp = sc->hn_vf_ifp;
3996			strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3997			    sizeof(ifr->ifr_name));
3998			error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3999			/* Restore the ifr_name. */
4000			strlcpy(ifr->ifr_name, ifp->if_xname,
4001			    sizeof(ifr->ifr_name));
4002			HN_UNLOCK(sc);
4003			break;
4004		}
4005		HN_UNLOCK(sc);
4006		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4007		break;
4008
4009	case SIOCGIFRSSHASH:
4010		ifrh = (struct ifrsshash *)data;
4011		HN_LOCK(sc);
4012		if (sc->hn_rx_ring_inuse == 1) {
4013			HN_UNLOCK(sc);
4014			ifrh->ifrh_func = RSS_FUNC_NONE;
4015			ifrh->ifrh_types = 0;
4016			break;
4017		}
4018
4019		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4020			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4021		else
4022			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4023		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4024		HN_UNLOCK(sc);
4025		break;
4026
4027	case SIOCGIFRSSKEY:
4028		ifrk = (struct ifrsskey *)data;
4029		HN_LOCK(sc);
4030		if (sc->hn_rx_ring_inuse == 1) {
4031			HN_UNLOCK(sc);
4032			ifrk->ifrk_func = RSS_FUNC_NONE;
4033			ifrk->ifrk_keylen = 0;
4034			break;
4035		}
4036		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4037			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4038		else
4039			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4040		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4041		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4042		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4043		HN_UNLOCK(sc);
4044		break;
4045
4046	default:
4047		error = ether_ioctl(ifp, cmd, data);
4048		break;
4049	}
4050	return (error);
4051}
4052
4053static void
4054hn_stop(struct hn_softc *sc, bool detaching)
4055{
4056	struct ifnet *ifp = sc->hn_ifp;
4057	int i;
4058
4059	HN_LOCK_ASSERT(sc);
4060
4061	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4062	    ("synthetic parts were not attached"));
4063
4064	/* Clear RUNNING bit ASAP. */
4065	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4066
4067	/* Disable polling. */
4068	hn_polling(sc, 0);
4069
4070	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4071		KASSERT(sc->hn_vf_ifp != NULL,
4072		    ("%s: VF is not attached", ifp->if_xname));
4073
4074		/* Mark transparent mode VF as disabled. */
4075		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4076
4077		/*
4078		 * NOTE:
4079		 * Datapath setting must happen _before_ bringing
4080		 * the VF down.
4081		 */
4082		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4083
4084		/*
4085		 * Bring the VF down.
4086		 */
4087		hn_xpnt_vf_saveifflags(sc);
4088		sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4089		hn_xpnt_vf_iocsetflags(sc);
4090	}
4091
4092	/* Suspend data transfers. */
4093	hn_suspend_data(sc);
4094
4095	/* Clear OACTIVE bit. */
4096	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4097	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4098		sc->hn_tx_ring[i].hn_oactive = 0;
4099
4100	/*
4101	 * If the non-transparent mode VF is active, make sure
4102	 * that the RX filter still allows packet reception.
4103	 */
4104	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4105		hn_rxfilter_config(sc);
4106}
4107
4108static void
4109hn_init_locked(struct hn_softc *sc)
4110{
4111	struct ifnet *ifp = sc->hn_ifp;
4112	int i;
4113
4114	HN_LOCK_ASSERT(sc);
4115
4116	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4117		return;
4118
4119	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4120		return;
4121
4122	/* Configure RX filter */
4123	hn_rxfilter_config(sc);
4124
4125	/* Clear OACTIVE bit. */
4126	atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4127	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4128		sc->hn_tx_ring[i].hn_oactive = 0;
4129
4130	/* Clear TX 'suspended' bit. */
4131	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4132
4133	if (hn_xpnt_vf_isready(sc)) {
4134		/* Initialize transparent VF. */
4135		hn_xpnt_vf_init(sc);
4136	}
4137
4138	/* Everything is ready; unleash! */
4139	atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4140
4141	/* Re-enable polling if requested. */
4142	if (sc->hn_pollhz > 0)
4143		hn_polling(sc, sc->hn_pollhz);
4144}
4145
4146static void
4147hn_init(void *xsc)
4148{
4149	struct hn_softc *sc = xsc;
4150
4151	HN_LOCK(sc);
4152	hn_init_locked(sc);
4153	HN_UNLOCK(sc);
4154}
4155
4156#if __FreeBSD_version >= 1100099
4157
4158static int
4159hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4160{
4161	struct hn_softc *sc = arg1;
4162	unsigned int lenlim;
4163	int error;
4164
4165	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4166	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4167	if (error || req->newptr == NULL)
4168		return error;
4169
4170	HN_LOCK(sc);
4171	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4172	    lenlim > TCP_LRO_LENGTH_MAX) {
4173		HN_UNLOCK(sc);
4174		return EINVAL;
4175	}
4176	hn_set_lro_lenlim(sc, lenlim);
4177	HN_UNLOCK(sc);
4178
4179	return 0;
4180}
4181
4182static int
4183hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4184{
4185	struct hn_softc *sc = arg1;
4186	int ackcnt, error, i;
4187
4188	/*
4189	 * lro_ackcnt_lim is the append count limit;
4190	 * +1 turns it into the aggregation limit.
4191	 */
4192	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4193	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4194	if (error || req->newptr == NULL)
4195		return error;
4196
4197	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4198		return EINVAL;
4199
4200	/*
4201	 * Convert aggregation limit back to append
4202	 * count limit.
4203	 */
4204	--ackcnt;
4205	HN_LOCK(sc);
4206	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4207		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4208	HN_UNLOCK(sc);
4209	return 0;
4210}
4211
4212#endif
4213
4214static int
4215hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4216{
4217	struct hn_softc *sc = arg1;
4218	int hcsum = arg2;
4219	int on, error, i;
4220
4221	on = 0;
4222	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4223		on = 1;
4224
4225	error = sysctl_handle_int(oidp, &on, 0, req);
4226	if (error || req->newptr == NULL)
4227		return error;
4228
4229	HN_LOCK(sc);
4230	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4231		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4232
4233		if (on)
4234			rxr->hn_trust_hcsum |= hcsum;
4235		else
4236			rxr->hn_trust_hcsum &= ~hcsum;
4237	}
4238	HN_UNLOCK(sc);
4239	return 0;
4240}
4241
4242static int
4243hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4244{
4245	struct hn_softc *sc = arg1;
4246	int chim_size, error;
4247
4248	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4249	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4250	if (error || req->newptr == NULL)
4251		return error;
4252
4253	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4254		return EINVAL;
4255
4256	HN_LOCK(sc);
4257	hn_set_chim_size(sc, chim_size);
4258	HN_UNLOCK(sc);
4259	return 0;
4260}
4261
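/*
 * Aggregate per-RX-ring statistics sysctl handler.  arg2 is the byte
 * offset of the counter within struct hn_rx_ring; reading returns the
 * sum of that counter across all RX rings, and writing any value zeroes
 * the counter on every ring.
 */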
4262#if __FreeBSD_version < 1100095
4263static int
4264hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4265{
4266	struct hn_softc *sc = arg1;
4267	int ofs = arg2, i, error;
4268	struct hn_rx_ring *rxr;
4269	uint64_t stat;
4270
4271	stat = 0;
4272	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4273		rxr = &sc->hn_rx_ring[i];
4274		stat += *((int *)((uint8_t *)rxr + ofs));
4275	}
4276
4277	error = sysctl_handle_64(oidp, &stat, 0, req);
4278	if (error || req->newptr == NULL)
4279		return error;
4280
4281	/* Zero out this stat. */
4282	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4283		rxr = &sc->hn_rx_ring[i];
4284		*((int *)((uint8_t *)rxr + ofs)) = 0;
4285	}
4286	return 0;
4287}
4288#else
4289static int
4290hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4291{
4292	struct hn_softc *sc = arg1;
4293	int ofs = arg2, i, error;
4294	struct hn_rx_ring *rxr;
4295	uint64_t stat;
4296
4297	stat = 0;
4298	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4299		rxr = &sc->hn_rx_ring[i];
4300		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4301	}
4302
4303	error = sysctl_handle_64(oidp, &stat, 0, req);
4304	if (error || req->newptr == NULL)
4305		return error;
4306
4307	/* Zero out this stat. */
4308	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4309		rxr = &sc->hn_rx_ring[i];
4310		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4311	}
4312	return 0;
4313}
4314
4315#endif
4316
4317static int
4318hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4319{
4320	struct hn_softc *sc = arg1;
4321	int ofs = arg2, i, error;
4322	struct hn_rx_ring *rxr;
4323	u_long stat;
4324
4325	stat = 0;
4326	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4327		rxr = &sc->hn_rx_ring[i];
4328		stat += *((u_long *)((uint8_t *)rxr + ofs));
4329	}
4330
4331	error = sysctl_handle_long(oidp, &stat, 0, req);
4332	if (error || req->newptr == NULL)
4333		return error;
4334
4335	/* Zero out this stat. */
4336	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4337		rxr = &sc->hn_rx_ring[i];
4338		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4339	}
4340	return 0;
4341}
4342
4343static int
4344hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4345{
4346	struct hn_softc *sc = arg1;
4347	int ofs = arg2, i, error;
4348	struct hn_tx_ring *txr;
4349	u_long stat;
4350
4351	stat = 0;
4352	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4353		txr = &sc->hn_tx_ring[i];
4354		stat += *((u_long *)((uint8_t *)txr + ofs));
4355	}
4356
4357	error = sysctl_handle_long(oidp, &stat, 0, req);
4358	if (error || req->newptr == NULL)
4359		return error;
4360
4361	/* Zero out this stat. */
4362	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4363		txr = &sc->hn_tx_ring[i];
4364		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4365	}
4366	return 0;
4367}
4368
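/*
 * Per-TX-ring configuration sysctl handler.  arg2 is the byte offset of
 * the field within struct hn_tx_ring; reading returns the value from the
 * first TX ring, and writing applies the new value to every TX ring.
 */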
4369static int
4370hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4371{
4372	struct hn_softc *sc = arg1;
4373	int ofs = arg2, i, error, conf;
4374	struct hn_tx_ring *txr;
4375
4376	txr = &sc->hn_tx_ring[0];
4377	conf = *((int *)((uint8_t *)txr + ofs));
4378
4379	error = sysctl_handle_int(oidp, &conf, 0, req);
4380	if (error || req->newptr == NULL)
4381		return error;
4382
4383	HN_LOCK(sc);
4384	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4385		txr = &sc->hn_tx_ring[i];
4386		*((int *)((uint8_t *)txr + ofs)) = conf;
4387	}
4388	HN_UNLOCK(sc);
4389
4390	return 0;
4391}
4392
4393static int
4394hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4395{
4396	struct hn_softc *sc = arg1;
4397	int error, size;
4398
4399	size = sc->hn_agg_size;
4400	error = sysctl_handle_int(oidp, &size, 0, req);
4401	if (error || req->newptr == NULL)
4402		return (error);
4403
4404	HN_LOCK(sc);
4405	sc->hn_agg_size = size;
4406	hn_set_txagg(sc);
4407	HN_UNLOCK(sc);
4408
4409	return (0);
4410}
4411
4412static int
4413hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4414{
4415	struct hn_softc *sc = arg1;
4416	int error, pkts;
4417
4418	pkts = sc->hn_agg_pkts;
4419	error = sysctl_handle_int(oidp, &pkts, 0, req);
4420	if (error || req->newptr == NULL)
4421		return (error);
4422
4423	HN_LOCK(sc);
4424	sc->hn_agg_pkts = pkts;
4425	hn_set_txagg(sc);
4426	HN_UNLOCK(sc);
4427
4428	return (0);
4429}
4430
4431static int
4432hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4433{
4434	struct hn_softc *sc = arg1;
4435	int pkts;
4436
4437	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4438	return (sysctl_handle_int(oidp, &pkts, 0, req));
4439}
4440
4441static int
4442hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4443{
4444	struct hn_softc *sc = arg1;
4445	int align;
4446
4447	align = sc->hn_tx_ring[0].hn_agg_align;
4448	return (sysctl_handle_int(oidp, &align, 0, req));
4449}
4450
4451static void
4452hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4453{
4454	if (pollhz == 0)
4455		vmbus_chan_poll_disable(chan);
4456	else
4457		vmbus_chan_poll_enable(chan, pollhz);
4458}
4459
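/*
 * Apply the polling rate to the primary channel and all sub-channels;
 * pollhz == 0 switches the channels back to interrupt-driven mode.
 */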
4460static void
4461hn_polling(struct hn_softc *sc, u_int pollhz)
4462{
4463	int nsubch = sc->hn_rx_ring_inuse - 1;
4464
4465	HN_LOCK_ASSERT(sc);
4466
4467	if (nsubch > 0) {
4468		struct vmbus_channel **subch;
4469		int i;
4470
4471		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4472		for (i = 0; i < nsubch; ++i)
4473			hn_chan_polling(subch[i], pollhz);
4474		vmbus_subchan_rel(subch, nsubch);
4475	}
4476	hn_chan_polling(sc->hn_prichan, pollhz);
4477}
4478
4479static int
4480hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4481{
4482	struct hn_softc *sc = arg1;
4483	int pollhz, error;
4484
4485	pollhz = sc->hn_pollhz;
4486	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4487	if (error || req->newptr == NULL)
4488		return (error);
4489
4490	if (pollhz != 0 &&
4491	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4492		return (EINVAL);
4493
4494	HN_LOCK(sc);
4495	if (sc->hn_pollhz != pollhz) {
4496		sc->hn_pollhz = pollhz;
4497		if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4498		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4499			hn_polling(sc, sc->hn_pollhz);
4500	}
4501	HN_UNLOCK(sc);
4502
4503	return (0);
4504}
4505
4506static int
4507hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4508{
4509	struct hn_softc *sc = arg1;
4510	char verstr[16];
4511
4512	snprintf(verstr, sizeof(verstr), "%u.%u",
4513	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4514	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4515	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4516}
4517
4518static int
4519hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4520{
4521	struct hn_softc *sc = arg1;
4522	char caps_str[128];
4523	uint32_t caps;
4524
4525	HN_LOCK(sc);
4526	caps = sc->hn_caps;
4527	HN_UNLOCK(sc);
4528	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4529	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4530}
4531
4532static int
4533hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4534{
4535	struct hn_softc *sc = arg1;
4536	char assist_str[128];
4537	uint32_t hwassist;
4538
4539	HN_LOCK(sc);
4540	hwassist = sc->hn_ifp->if_hwassist;
4541	HN_UNLOCK(sc);
4542	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4543	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4544}
4545
4546static int
4547hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4548{
4549	struct hn_softc *sc = arg1;
4550	char filter_str[128];
4551	uint32_t filter;
4552
4553	HN_LOCK(sc);
4554	filter = sc->hn_rx_filter;
4555	HN_UNLOCK(sc);
4556	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4557	    NDIS_PACKET_TYPES);
4558	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4559}
4560
4561#ifndef RSS
4562
4563static int
4564hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4565{
4566	struct hn_softc *sc = arg1;
4567	int error;
4568
4569	HN_LOCK(sc);
4570
4571	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4572	if (error || req->newptr == NULL)
4573		goto back;
4574
4575	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4576	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4577		/*
4578		 * The RSS key is synchronized w/ the VF's; don't allow
4579		 * users to change it.
4580		 */
4581		error = EBUSY;
4582		goto back;
4583	}
4584
4585	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4586	if (error)
4587		goto back;
4588	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4589
4590	if (sc->hn_rx_ring_inuse > 1) {
4591		error = hn_rss_reconfig(sc);
4592	} else {
4593		/* Not RSS capable, at least for now; just save the RSS key. */
4594		error = 0;
4595	}
4596back:
4597	HN_UNLOCK(sc);
4598	return (error);
4599}
4600
4601static int
4602hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4603{
4604	struct hn_softc *sc = arg1;
4605	int error;
4606
4607	HN_LOCK(sc);
4608
4609	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4610	if (error || req->newptr == NULL)
4611		goto back;
4612
4613	/*
4614	 * Don't allow RSS indirection table changes, if this interface is
4615	 * not currently RSS capable.
4616	 */
4617	if (sc->hn_rx_ring_inuse == 1) {
4618		error = EOPNOTSUPP;
4619		goto back;
4620	}
4621
4622	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4623	if (error)
4624		goto back;
4625	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4626
4627	hn_rss_ind_fixup(sc);
4628	error = hn_rss_reconfig(sc);
4629back:
4630	HN_UNLOCK(sc);
4631	return (error);
4632}
4633
4634#endif	/* !RSS */
4635
4636static int
4637hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4638{
4639	struct hn_softc *sc = arg1;
4640	char hash_str[128];
4641	uint32_t hash;
4642
4643	HN_LOCK(sc);
4644	hash = sc->hn_rss_hash;
4645	HN_UNLOCK(sc);
4646	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4647	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4648}
4649
4650static int
4651hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4652{
4653	struct hn_softc *sc = arg1;
4654	char hash_str[128];
4655	uint32_t hash;
4656
4657	HN_LOCK(sc);
4658	hash = sc->hn_rss_hcap;
4659	HN_UNLOCK(sc);
4660	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4661	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4662}
4663
4664static int
4665hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4666{
4667	struct hn_softc *sc = arg1;
4668	char hash_str[128];
4669	uint32_t hash;
4670
4671	HN_LOCK(sc);
4672	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4673	HN_UNLOCK(sc);
4674	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4675	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4676}
4677
4678static int
4679hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4680{
4681	struct hn_softc *sc = arg1;
4682	char vf_name[IFNAMSIZ + 1];
4683	struct ifnet *vf_ifp;
4684
4685	HN_LOCK(sc);
4686	vf_name[0] = '\0';
4687	vf_ifp = sc->hn_vf_ifp;
4688	if (vf_ifp != NULL)
4689		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4690	HN_UNLOCK(sc);
4691	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4692}
4693
4694static int
4695hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4696{
4697	struct hn_softc *sc = arg1;
4698	char vf_name[IFNAMSIZ + 1];
4699	struct ifnet *vf_ifp;
4700
4701	HN_LOCK(sc);
4702	vf_name[0] = '\0';
4703	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4704	if (vf_ifp != NULL)
4705		snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4706	HN_UNLOCK(sc);
4707	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4708}
4709
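/*
 * Report a space-separated list of all VF interfaces currently recorded
 * in the global hn_vfmap table.
 */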
4710static int
4711hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4712{
4713	struct rm_priotracker pt;
4714	struct sbuf *sb;
4715	int error, i;
4716	bool first;
4717
4718	error = sysctl_wire_old_buffer(req, 0);
4719	if (error != 0)
4720		return (error);
4721
4722	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4723	if (sb == NULL)
4724		return (ENOMEM);
4725
4726	rm_rlock(&hn_vfmap_lock, &pt);
4727
4728	first = true;
4729	for (i = 0; i < hn_vfmap_size; ++i) {
4730		struct ifnet *ifp;
4731
4732		if (hn_vfmap[i] == NULL)
4733			continue;
4734
4735		ifp = ifnet_byindex(i);
4736		if (ifp != NULL) {
4737			if (first)
4738				sbuf_printf(sb, "%s", ifp->if_xname);
4739			else
4740				sbuf_printf(sb, " %s", ifp->if_xname);
4741			first = false;
4742		}
4743	}
4744
4745	rm_runlock(&hn_vfmap_lock, &pt);
4746
4747	error = sbuf_finish(sb);
4748	sbuf_delete(sb);
4749	return (error);
4750}
4751
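/*
 * Report the VF-to-hn(4) mapping as space-separated "VF:hn" name pairs,
 * one pair for each entry in the global hn_vfmap table.
 */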
4752static int
4753hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4754{
4755	struct rm_priotracker pt;
4756	struct sbuf *sb;
4757	int error, i;
4758	bool first;
4759
4760	error = sysctl_wire_old_buffer(req, 0);
4761	if (error != 0)
4762		return (error);
4763
4764	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4765	if (sb == NULL)
4766		return (ENOMEM);
4767
4768	rm_rlock(&hn_vfmap_lock, &pt);
4769
4770	first = true;
4771	for (i = 0; i < hn_vfmap_size; ++i) {
4772		struct ifnet *ifp, *hn_ifp;
4773
4774		hn_ifp = hn_vfmap[i];
4775		if (hn_ifp == NULL)
4776			continue;
4777
4778		ifp = ifnet_byindex(i);
4779		if (ifp != NULL) {
4780			if (first) {
4781				sbuf_printf(sb, "%s:%s", ifp->if_xname,
4782				    hn_ifp->if_xname);
4783			} else {
4784				sbuf_printf(sb, " %s:%s", ifp->if_xname,
4785				    hn_ifp->if_xname);
4786			}
4787			first = false;
4788		}
4789	}
4790
4791	rm_runlock(&hn_vfmap_lock, &pt);
4792
4793	error = sbuf_finish(sb);
4794	sbuf_delete(sb);
4795	return (error);
4796}
4797
4798static int
4799hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4800{
4801	struct hn_softc *sc = arg1;
4802	int error, onoff = 0;
4803
4804	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4805		onoff = 1;
4806	error = sysctl_handle_int(oidp, &onoff, 0, req);
4807	if (error || req->newptr == NULL)
4808		return (error);
4809
4810	HN_LOCK(sc);
4811	/* NOTE: hn_vf_lock for hn_transmit() */
4812	rm_wlock(&sc->hn_vf_lock);
4813	if (onoff)
4814		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4815	else
4816		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4817	rm_wunlock(&sc->hn_vf_lock);
4818	HN_UNLOCK(sc);
4819
4820	return (0);
4821}
4822
4823static int
4824hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4825{
4826	struct hn_softc *sc = arg1;
4827	int enabled = 0;
4828
4829	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4830		enabled = 1;
4831	return (sysctl_handle_int(oidp, &enabled, 0, req));
4832}
4833
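/*
 * Sanity check the IPv4 packet starting at byte offset 'hoff' within the
 * mbuf: the IP header, and the TCP or UDP header if present, must be
 * fully contained in the first mbuf and self-consistent.  Returns the IP
 * protocol number on success, or IPPROTO_DONE for fragments, truncated
 * packets and other cases that cannot be verified.
 */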
4834static int
4835hn_check_iplen(const struct mbuf *m, int hoff)
4836{
4837	const struct ip *ip;
4838	int len, iphlen, iplen;
4839	const struct tcphdr *th;
4840	int thoff;				/* TCP data offset */
4841
4842	len = hoff + sizeof(struct ip);
4843
4844	/* The packet must be at least the size of an IP header. */
4845	if (m->m_pkthdr.len < len)
4846		return IPPROTO_DONE;
4847
4848	/* The fixed IP header must reside completely in the first mbuf. */
4849	if (m->m_len < len)
4850		return IPPROTO_DONE;
4851
4852	ip = mtodo(m, hoff);
4853
4854	/* Bound check the packet's stated IP header length. */
4855	iphlen = ip->ip_hl << 2;
4856	if (iphlen < sizeof(struct ip))		/* minimum header length */
4857		return IPPROTO_DONE;
4858
4859	/* The full IP header must reside completely in the first mbuf. */
4860	if (m->m_len < hoff + iphlen)
4861		return IPPROTO_DONE;
4862
4863	iplen = ntohs(ip->ip_len);
4864
4865	/*
4866	 * Check that the amount of data in the buffers is at
4867	 * least as much as the IP header would have us expect.
4868	 */
4869	if (m->m_pkthdr.len < hoff + iplen)
4870		return IPPROTO_DONE;
4871
4872	/*
4873	 * Ignore IP fragments.
4874	 */
4875	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4876		return IPPROTO_DONE;
4877
4878	/*
4879	 * The TCP/IP or UDP/IP header must be entirely contained within
4880	 * the first fragment of a packet.
4881	 */
4882	switch (ip->ip_p) {
4883	case IPPROTO_TCP:
4884		if (iplen < iphlen + sizeof(struct tcphdr))
4885			return IPPROTO_DONE;
4886		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4887			return IPPROTO_DONE;
4888		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4889		thoff = th->th_off << 2;
4890		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4891			return IPPROTO_DONE;
4892		if (m->m_len < hoff + iphlen + thoff)
4893			return IPPROTO_DONE;
4894		break;
4895	case IPPROTO_UDP:
4896		if (iplen < iphlen + sizeof(struct udphdr))
4897			return IPPROTO_DONE;
4898		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4899			return IPPROTO_DONE;
4900		break;
4901	default:
4902		if (iplen < iphlen)
4903			return IPPROTO_DONE;
4904		break;
4905	}
4906	return ip->ip_p;
4907}
4908
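/*
 * Extract the L3 ethertype and L4 protocol of the received frame,
 * skipping a VLAN header if one is present.  *l4proto is set to
 * IPPROTO_DONE for non-IPv4 frames and for IPv4 packets rejected by
 * hn_check_iplen().
 */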
4909static void
4910hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4911{
4912	const struct ether_header *eh;
4913	uint16_t etype;
4914	int hoff;
4915
4916	hoff = sizeof(*eh);
4917	/* Checked by the caller. */
4918	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4919
4920	eh = mtod(m_new, const struct ether_header *);
4921	etype = ntohs(eh->ether_type);
4922	if (etype == ETHERTYPE_VLAN) {
4923		const struct ether_vlan_header *evl;
4924
4925		hoff = sizeof(*evl);
4926		if (m_new->m_len < hoff)
4927			return;
4928		evl = mtod(m_new, const struct ether_vlan_header *);
4929		etype = ntohs(evl->evl_proto);
4930	}
4931	*l3proto = etype;
4932
4933	if (etype == ETHERTYPE_IP)
4934		*l4proto = hn_check_iplen(m_new, hoff);
4935	else
4936		*l4proto = IPPROTO_DONE;
4937}
4938
4939static int
4940hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4941{
4942	struct sysctl_oid_list *child;
4943	struct sysctl_ctx_list *ctx;
4944	device_t dev = sc->hn_dev;
4945#if defined(INET) || defined(INET6)
4946#if __FreeBSD_version >= 1100095
4947	int lroent_cnt;
4948#endif
4949#endif
4950	int i;
4951
4952	/*
4953	 * Create RXBUF for reception.
4954	 *
4955	 * NOTE:
4956	 * - It is shared by all channels.
4957	 * - A large enough buffer is allocated; certain versions of NVS
4958	 *   may further limit the usable space.
4959	 */
4960	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4961	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4962	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
4963	if (sc->hn_rxbuf == NULL) {
4964		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4965		return (ENOMEM);
4966	}
4967
4968	sc->hn_rx_ring_cnt = ring_cnt;
4969	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4970
4971	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4972	    M_DEVBUF, M_WAITOK | M_ZERO);
4973
4974#if defined(INET) || defined(INET6)
4975#if __FreeBSD_version >= 1100095
4976	lroent_cnt = hn_lro_entry_count;
4977	if (lroent_cnt < TCP_LRO_ENTRIES)
4978		lroent_cnt = TCP_LRO_ENTRIES;
4979	if (bootverbose)
4980		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4981#endif
4982#endif	/* INET || INET6 */
4983
4984	ctx = device_get_sysctl_ctx(dev);
4985	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4986
4987	/* Create dev.hn.UNIT.rx sysctl tree */
4988	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4989	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4990
4991	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4992		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4993
4994		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4995		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4996		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
4997		if (rxr->hn_br == NULL) {
4998			device_printf(dev, "allocate bufring failed\n");
4999			return (ENOMEM);
5000		}
5001
5002		if (hn_trust_hosttcp)
5003			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
5004		if (hn_trust_hostudp)
5005			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
5006		if (hn_trust_hostip)
5007			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
5008		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
5009		rxr->hn_ifp = sc->hn_ifp;
5010		if (i < sc->hn_tx_ring_cnt)
5011			rxr->hn_txr = &sc->hn_tx_ring[i];
5012		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
5013		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
5014		rxr->hn_rx_idx = i;
5015		rxr->hn_rxbuf = sc->hn_rxbuf;
5016
5017		/*
5018		 * Initialize LRO.
5019		 */
5020#if defined(INET) || defined(INET6)
5021#if __FreeBSD_version >= 1100095
5022		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5023		    hn_lro_mbufq_depth);
5024#else
5025		tcp_lro_init(&rxr->hn_lro);
5026		rxr->hn_lro.ifp = sc->hn_ifp;
5027#endif
5028#if __FreeBSD_version >= 1100099
5029		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5030		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5031#endif
5032#endif	/* INET || INET6 */
5033
5034		if (sc->hn_rx_sysctl_tree != NULL) {
5035			char name[16];
5036
5037			/*
5038			 * Create per RX ring sysctl tree:
5039			 * dev.hn.UNIT.rx.RINGID
5040			 */
5041			snprintf(name, sizeof(name), "%d", i);
5042			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5043			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5044			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5045
5046			if (rxr->hn_rx_sysctl_tree != NULL) {
5047				SYSCTL_ADD_ULONG(ctx,
5048				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5049				    OID_AUTO, "packets", CTLFLAG_RW,
5050				    &rxr->hn_pkts, "# of packets received");
5051				SYSCTL_ADD_ULONG(ctx,
5052				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5053				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
5054				    &rxr->hn_rss_pkts,
5055				    "# of packets w/ RSS info received");
5056				SYSCTL_ADD_ULONG(ctx,
5057				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5058				    OID_AUTO, "rsc_pkts", CTLFLAG_RW,
5059				    &rxr->hn_rsc_pkts,
5060				    "# of RSC packets received");
5061				SYSCTL_ADD_ULONG(ctx,
5062				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5063				    OID_AUTO, "rsc_drop", CTLFLAG_RW,
5064				    &rxr->hn_rsc_drop,
5065				    "# of RSC fragments dropped");
5066				SYSCTL_ADD_INT(ctx,
5067				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5068				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5069				    &rxr->hn_pktbuf_len, 0,
5070				    "Temporary channel packet buffer length");
5071			}
5072		}
5073	}
5074
5075	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5076	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5077	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5078#if __FreeBSD_version < 1100095
5079	    hn_rx_stat_int_sysctl,
5080#else
5081	    hn_rx_stat_u64_sysctl,
5082#endif
5083	    "LU", "LRO queued");
5084	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5085	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5086	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5087#if __FreeBSD_version < 1100095
5088	    hn_rx_stat_int_sysctl,
5089#else
5090	    hn_rx_stat_u64_sysctl,
5091#endif
5092	    "LU", "LRO flushed");
5093	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5094	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5095	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5096	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5097#if __FreeBSD_version >= 1100099
5098	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5099	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5100	    hn_lro_lenlim_sysctl, "IU",
5101	    "Max # of data bytes to be aggregated by LRO");
5102	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5103	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5104	    hn_lro_ackcnt_sysctl, "I",
5105	    "Max # of ACKs to be aggregated by LRO");
5106#endif
5107	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5108	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5109	    hn_trust_hcsum_sysctl, "I",
5110	    "Trust tcp segment verification on host side, "
5111	    "when csum info is missing");
5112	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5113	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5114	    hn_trust_hcsum_sysctl, "I",
5115	    "Trust udp datagram verification on host side, "
5116	    "when csum info is missing");
5117	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5118	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5119	    hn_trust_hcsum_sysctl, "I",
5120	    "Trust ip packet verification on host side, "
5121	    "when csum info is missing");
5122	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5123	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5124	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5125	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5126	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5127	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5128	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5129	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5130	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5131	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5132	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5133	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5134	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5135	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5136	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5137	    hn_rx_stat_ulong_sysctl, "LU",
5138	    "# of packets that we trust host's csum verification");
5139	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5140	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5141	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5142	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5143	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5144	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5145	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5146	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5147	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5148	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5149	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5150	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5151
5152	return (0);
5153}
5154
5155static void
5156hn_destroy_rx_data(struct hn_softc *sc)
5157{
5158	int i;
5159
5160	if (sc->hn_rxbuf != NULL) {
5161		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5162			hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5163		else
5164			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5165		sc->hn_rxbuf = NULL;
5166	}
5167
5168	if (sc->hn_rx_ring_cnt == 0)
5169		return;
5170
5171	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5172		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5173
5174		if (rxr->hn_br == NULL)
5175			continue;
5176		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5177			hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5178		} else {
5179			device_printf(sc->hn_dev,
5180			    "%dth channel bufring is referenced\n", i);
5181		}
5182		rxr->hn_br = NULL;
5183
5184#if defined(INET) || defined(INET6)
5185		tcp_lro_free(&rxr->hn_lro);
5186#endif
5187		free(rxr->hn_pktbuf, M_DEVBUF);
5188	}
5189	free(sc->hn_rx_ring, M_DEVBUF);
5190	sc->hn_rx_ring = NULL;
5191
5192	sc->hn_rx_ring_cnt = 0;
5193	sc->hn_rx_ring_inuse = 0;
5194}
5195
5196static int
5197hn_tx_ring_create(struct hn_softc *sc, int id)
5198{
5199	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5200	device_t dev = sc->hn_dev;
5201	bus_dma_tag_t parent_dtag;
5202	int error, i;
5203
5204	txr->hn_sc = sc;
5205	txr->hn_tx_idx = id;
5206
5207#ifndef HN_USE_TXDESC_BUFRING
5208	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5209#endif
5210	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5211
5212	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5213	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5214	    M_DEVBUF, M_WAITOK | M_ZERO);
5215#ifndef HN_USE_TXDESC_BUFRING
5216	SLIST_INIT(&txr->hn_txlist);
5217#else
5218	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5219	    M_WAITOK, &txr->hn_tx_lock);
5220#endif
5221
5222	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5223		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5224		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5225	} else {
5226		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5227	}
5228
5229#ifdef HN_IFSTART_SUPPORT
5230	if (hn_use_if_start) {
5231		txr->hn_txeof = hn_start_txeof;
5232		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5233		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5234	} else
5235#endif
5236	{
5237		int br_depth;
5238
5239		txr->hn_txeof = hn_xmit_txeof;
5240		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5241		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5242
5243		br_depth = hn_get_txswq_depth(txr);
5244		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5245		    M_WAITOK, &txr->hn_tx_lock);
5246	}
5247
5248	txr->hn_direct_tx_size = hn_direct_tx_size;
5249
5250	/*
5251	 * Always schedule transmission instead of trying to do direct
5252	 * transmission.  This gives the best performance so far.
5253	 */
5254	txr->hn_sched_tx = 1;
5255
5256	parent_dtag = bus_get_dma_tag(dev);
5257
5258	/* DMA tag for RNDIS packet messages. */
5259	error = bus_dma_tag_create(parent_dtag, /* parent */
5260	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5261	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5262	    BUS_SPACE_MAXADDR,		/* lowaddr */
5263	    BUS_SPACE_MAXADDR,		/* highaddr */
5264	    NULL, NULL,			/* filter, filterarg */
5265	    HN_RNDIS_PKT_LEN,		/* maxsize */
5266	    1,				/* nsegments */
5267	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5268	    0,				/* flags */
5269	    NULL,			/* lockfunc */
5270	    NULL,			/* lockfuncarg */
5271	    &txr->hn_tx_rndis_dtag);
5272	if (error) {
5273		device_printf(dev, "failed to create rndis dmatag\n");
5274		return error;
5275	}
5276
5277	/* DMA tag for data. */
5278	error = bus_dma_tag_create(parent_dtag, /* parent */
5279	    1,				/* alignment */
5280	    HN_TX_DATA_BOUNDARY,	/* boundary */
5281	    BUS_SPACE_MAXADDR,		/* lowaddr */
5282	    BUS_SPACE_MAXADDR,		/* highaddr */
5283	    NULL, NULL,			/* filter, filterarg */
5284	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5285	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5286	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5287	    0,				/* flags */
5288	    NULL,			/* lockfunc */
5289	    NULL,			/* lockfuncarg */
5290	    &txr->hn_tx_data_dtag);
5291	if (error) {
5292		device_printf(dev, "failed to create data dmatag\n");
5293		return error;
5294	}
5295
5296	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5297		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5298
5299		txd->txr = txr;
5300		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5301		STAILQ_INIT(&txd->agg_list);
5302
5303		/*
5304		 * Allocate and load RNDIS packet message.
5305		 */
5306		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5307		    (void **)&txd->rndis_pkt,
5308		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5309		    &txd->rndis_pkt_dmap);
5310		if (error) {
5311			device_printf(dev,
5312			    "failed to allocate rndis_packet_msg, %d\n", i);
5313			return error;
5314		}
5315
5316		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5317		    txd->rndis_pkt_dmap,
5318		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5319		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5320		    BUS_DMA_NOWAIT);
5321		if (error) {
5322			device_printf(dev,
5323			    "failed to load rndis_packet_msg, %d\n", i);
5324			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5325			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5326			return error;
5327		}
5328
5329		/* DMA map for TX data. */
5330		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5331		    &txd->data_dmap);
5332		if (error) {
5333			device_printf(dev,
5334			    "failed to allocate tx data dmamap\n");
5335			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5336			    txd->rndis_pkt_dmap);
5337			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5338			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5339			return error;
5340		}
5341
5342		/* All set, put it to list */
5343		txd->flags |= HN_TXD_FLAG_ONLIST;
5344#ifndef HN_USE_TXDESC_BUFRING
5345		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5346#else
5347		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5348#endif
5349	}
5350	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5351
5352	if (sc->hn_tx_sysctl_tree != NULL) {
5353		struct sysctl_oid_list *child;
5354		struct sysctl_ctx_list *ctx;
5355		char name[16];
5356
5357		/*
5358		 * Create per TX ring sysctl tree:
5359		 * dev.hn.UNIT.tx.RINGID
5360		 */
5361		ctx = device_get_sysctl_ctx(dev);
5362		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5363
5364		snprintf(name, sizeof(name), "%d", id);
5365		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5366		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5367
5368		if (txr->hn_tx_sysctl_tree != NULL) {
5369			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5370
5371#ifdef HN_DEBUG
5372			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5373			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5374			    "# of available TX descs");
5375#endif
5376#ifdef HN_IFSTART_SUPPORT
5377			if (!hn_use_if_start)
5378#endif
5379			{
5380				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5381				    CTLFLAG_RD, &txr->hn_oactive, 0,
5382				    "over active");
5383			}
5384			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5385			    CTLFLAG_RW, &txr->hn_pkts,
5386			    "# of packets transmitted");
5387			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5388			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
5389		}
5390	}
5391
5392	return 0;
5393}
5394
5395static void
5396hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5397{
5398	struct hn_tx_ring *txr = txd->txr;
5399
5400	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5401	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5402
5403	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5404	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5405	    txd->rndis_pkt_dmap);
5406	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5407}
5408
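/*
 * Garbage-collect a txdesc that is still pending when the TX ring is
 * being destroyed, e.g. because the channel was revoked before its
 * send-done arrived.  Aggregated txds are skipped here; they are freed
 * by their aggregating txd.
 */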
5409static void
5410hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5411{
5412
5413	KASSERT(txd->refs == 0 || txd->refs == 1,
5414	    ("invalid txd refs %d", txd->refs));
5415
5416	/* Aggregated txds will be freed by their aggregating txd. */
5417	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5418		int freed;
5419
5420		freed = hn_txdesc_put(txr, txd);
5421		KASSERT(freed, ("can't free txdesc"));
5422	}
5423}
5424
5425static void
5426hn_tx_ring_destroy(struct hn_tx_ring *txr)
5427{
5428	int i;
5429
5430	if (txr->hn_txdesc == NULL)
5431		return;
5432
5433	/*
5434	 * NOTE:
5435	 * Because the freeing of aggregated txds will be deferred
5436	 * to the aggregating txd, two passes are used here:
5437	 * - The first pass GCes any pending txds.  This GC is necessary,
5438	 *   since if the channels are revoked, the hypervisor will not
5439	 *   deliver send-done for all pending txds.
5440	 * - The second pass frees the busdma resources, i.e. it runs after
5441	 *   all txds have been freed.
5442	 */
5443	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5444		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5445	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5446		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5447
5448	if (txr->hn_tx_data_dtag != NULL)
5449		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5450	if (txr->hn_tx_rndis_dtag != NULL)
5451		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5452
5453#ifdef HN_USE_TXDESC_BUFRING
5454	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5455#endif
5456
5457	free(txr->hn_txdesc, M_DEVBUF);
5458	txr->hn_txdesc = NULL;
5459
5460	if (txr->hn_mbuf_br != NULL)
5461		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5462
5463#ifndef HN_USE_TXDESC_BUFRING
5464	mtx_destroy(&txr->hn_txlist_spin);
5465#endif
5466	mtx_destroy(&txr->hn_tx_lock);
5467}
5468
5469static int
5470hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5471{
5472	struct sysctl_oid_list *child;
5473	struct sysctl_ctx_list *ctx;
5474	int i;
5475
5476	/*
5477	 * Create TXBUF for chimney sending.
5478	 *
5479	 * NOTE: It is shared by all channels.
5480	 */
5481	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5482	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5483	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
5484	if (sc->hn_chim == NULL) {
5485		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5486		return (ENOMEM);
5487	}
5488
5489	sc->hn_tx_ring_cnt = ring_cnt;
5490	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5491
5492	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5493	    M_DEVBUF, M_WAITOK | M_ZERO);
5494
5495	ctx = device_get_sysctl_ctx(sc->hn_dev);
5496	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5497
5498	/* Create dev.hn.UNIT.tx sysctl tree */
5499	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5500	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5501
5502	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5503		int error;
5504
5505		error = hn_tx_ring_create(sc, i);
5506		if (error)
5507			return error;
5508	}
5509
5510	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5511	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5512	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5513	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5514	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5515	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5516	    __offsetof(struct hn_tx_ring, hn_send_failed),
5517	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v send failures");
5518	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5519	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5520	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5521	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5522	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5523	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5524	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5525	    hn_tx_stat_ulong_sysctl, "LU",
5526	    "# of packet transmission aggregation flush failures");
5527	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5528	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5529	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5530	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5531	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5532	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5533	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5534	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5535	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5536	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5537	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5538	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5539	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5540	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5541	    "# of total TX descs");
5542	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5543	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5544	    "Chimney send packet size upper boundary");
5545	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5546	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5547	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5548	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5549	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5550	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5551	    hn_tx_conf_int_sysctl, "I",
5552	    "Size of the packet for direct transmission");
5553	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5554	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5555	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5556	    hn_tx_conf_int_sysctl, "I",
5557	    "Always schedule transmission "
5558	    "instead of doing direct transmission");
5559	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5560	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5561	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5562	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5563	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5564	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5565	    "Applied packet transmission aggregation size");
5566	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5567	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5568	    hn_txagg_pktmax_sysctl, "I",
5569	    "Applied packet transmission aggregation packets");
5570	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5571	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5572	    hn_txagg_align_sysctl, "I",
5573	    "Applied packet transmission aggregation alignment");
5574
5575	return 0;
5576}
5577
5578static void
5579hn_set_chim_size(struct hn_softc *sc, int chim_size)
5580{
5581	int i;
5582
5583	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5584		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5585}
5586
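/*
 * Recompute the interface's TSO size limit: clamp 'tso_maxlen' between
 * the NDIS minimum (hn_ndis_tso_sgmin * mtu) and the smaller of
 * IP_MAXPACKET and hn_ndis_tso_szmax, then subtract the Ethernet and
 * VLAN header sizes.  If a transparent VF is ready, the VF's own TSO
 * limit is honored as well.
 */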
5587static void
5588hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5589{
5590	struct ifnet *ifp = sc->hn_ifp;
5591	u_int hw_tsomax;
5592	int tso_minlen;
5593
5594	HN_LOCK_ASSERT(sc);
5595
5596	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5597		return;
5598
5599	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5600	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5601	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5602
5603	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5604	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5605	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5606
5607	if (tso_maxlen < tso_minlen)
5608		tso_maxlen = tso_minlen;
5609	else if (tso_maxlen > IP_MAXPACKET)
5610		tso_maxlen = IP_MAXPACKET;
5611	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5612		tso_maxlen = sc->hn_ndis_tso_szmax;
5613	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5614
5615	if (hn_xpnt_vf_isready(sc)) {
5616		if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5617			hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5618	}
5619	ifp->if_hw_tsomax = hw_tsomax;
5620	if (bootverbose)
5621		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5622}
5623
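/*
 * Apply the negotiated TX parameters: set the chimney sending size
 * (optionally limited by the hn_tx_chimney_size tunable), build the
 * checksum offload assistance mask from the host capabilities, and
 * enable HASHVAL pktinfo on all TX rings if the host supports it.
 */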
5624static void
5625hn_fixup_tx_data(struct hn_softc *sc)
5626{
5627	uint64_t csum_assist;
5628	int i;
5629
5630	hn_set_chim_size(sc, sc->hn_chim_szmax);
5631	if (hn_tx_chimney_size > 0 &&
5632	    hn_tx_chimney_size < sc->hn_chim_szmax)
5633		hn_set_chim_size(sc, hn_tx_chimney_size);
5634
5635	csum_assist = 0;
5636	if (sc->hn_caps & HN_CAP_IPCS)
5637		csum_assist |= CSUM_IP;
5638	if (sc->hn_caps & HN_CAP_TCP4CS)
5639		csum_assist |= CSUM_IP_TCP;
5640	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5641		csum_assist |= CSUM_IP_UDP;
5642	if (sc->hn_caps & HN_CAP_TCP6CS)
5643		csum_assist |= CSUM_IP6_TCP;
5644	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5645		csum_assist |= CSUM_IP6_UDP;
5646	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5647		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5648
5649	if (sc->hn_caps & HN_CAP_HASHVAL) {
5650		/*
5651		 * Support HASHVAL pktinfo on TX path.
5652		 */
5653		if (bootverbose)
5654			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5655		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5656			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5657	}
5658}
5659
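/*
 * Apply negotiated RX parameters: mark all RX rings as UDP-hash capable,
 * if the host supports hashing UDP packets.
 */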
5660static void
5661hn_fixup_rx_data(struct hn_softc *sc)
5662{
5663
5664	if (sc->hn_caps & HN_CAP_UDPHASH) {
5665		int i;
5666
5667		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5668			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5669	}
5670}
5671
5672static void
5673hn_destroy_tx_data(struct hn_softc *sc)
5674{
5675	int i;
5676
5677	if (sc->hn_chim != NULL) {
5678		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5679			hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5680		} else {
5681			device_printf(sc->hn_dev,
5682			    "chimney sending buffer is referenced\n");
5683		}
5684		sc->hn_chim = NULL;
5685	}
5686
5687	if (sc->hn_tx_ring_cnt == 0)
5688		return;
5689
5690	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5691		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5692
5693	free(sc->hn_tx_ring, M_DEVBUF);
5694	sc->hn_tx_ring = NULL;
5695
5696	sc->hn_tx_ring_cnt = 0;
5697	sc->hn_tx_ring_inuse = 0;
5698}
5699
5700#ifdef HN_IFSTART_SUPPORT
5701
5702static void
5703hn_start_taskfunc(void *xtxr, int pending __unused)
5704{
5705	struct hn_tx_ring *txr = xtxr;
5706
5707	mtx_lock(&txr->hn_tx_lock);
5708	hn_start_locked(txr, 0);
5709	mtx_unlock(&txr->hn_tx_lock);
5710}
5711
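/*
 * if_start TX path for the first (and only) TX ring.
 *
 * Dequeue packets from if_snd and transmit them.  If 'len' > 0, stop and
 * return non-zero once a packet larger than 'len' is seen, so that the
 * caller can defer the remaining work to the TX taskqueue.
 */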
5712static int
5713hn_start_locked(struct hn_tx_ring *txr, int len)
5714{
5715	struct hn_softc *sc = txr->hn_sc;
5716	struct ifnet *ifp = sc->hn_ifp;
5717	int sched = 0;
5718
5719	KASSERT(hn_use_if_start,
5720	    ("hn_start_locked is called when if_start is disabled"));
5721	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5722	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5723	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5724
5725	if (__predict_false(txr->hn_suspended))
5726		return (0);
5727
5728	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5729	    IFF_DRV_RUNNING)
5730		return (0);
5731
5732	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5733		struct hn_txdesc *txd;
5734		struct mbuf *m_head;
5735		int error;
5736
5737		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5738		if (m_head == NULL)
5739			break;
5740
5741		if (len > 0 && m_head->m_pkthdr.len > len) {
5742			/*
5743			 * This send could be time consuming; let callers
5744			 * dispatch this packet (and any following packets)
5745			 * to the TX taskqueue.
5746			 */
5747			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5748			sched = 1;
5749			break;
5750		}
5751
5752#if defined(INET6) || defined(INET)
5753		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5754			m_head = hn_tso_fixup(m_head);
5755			if (__predict_false(m_head == NULL)) {
5756				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5757				continue;
5758			}
5759		} else if (m_head->m_pkthdr.csum_flags &
5760		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5761			m_head = hn_set_hlen(m_head);
5762			if (__predict_false(m_head == NULL)) {
5763				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5764				continue;
5765			}
5766		}
5767#endif
5768
5769		txd = hn_txdesc_get(txr);
5770		if (txd == NULL) {
5771			txr->hn_no_txdescs++;
5772			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5773			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5774			break;
5775		}
5776
5777		error = hn_encap(ifp, txr, txd, &m_head);
5778		if (error) {
5779			/* Both txd and m_head are freed */
5780			KASSERT(txr->hn_agg_txd == NULL,
5781			    ("encap failed w/ pending aggregating txdesc"));
5782			continue;
5783		}
5784
5785		if (txr->hn_agg_pktleft == 0) {
5786			if (txr->hn_agg_txd != NULL) {
5787				KASSERT(m_head == NULL,
5788				    ("pending mbuf for aggregating txdesc"));
5789				error = hn_flush_txagg(ifp, txr);
5790				if (__predict_false(error)) {
5791					atomic_set_int(&ifp->if_drv_flags,
5792					    IFF_DRV_OACTIVE);
5793					break;
5794				}
5795			} else {
5796				KASSERT(m_head != NULL, ("mbuf was freed"));
5797				error = hn_txpkt(ifp, txr, txd);
5798				if (__predict_false(error)) {
5799					/* txd is freed, but m_head is not */
5800					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5801					atomic_set_int(&ifp->if_drv_flags,
5802					    IFF_DRV_OACTIVE);
5803					break;
5804				}
5805			}
5806		}
5807#ifdef INVARIANTS
5808		else {
5809			KASSERT(txr->hn_agg_txd != NULL,
5810			    ("no aggregating txdesc"));
5811			KASSERT(m_head == NULL,
5812			    ("pending mbuf for aggregating txdesc"));
5813		}
5814#endif
5815	}
5816
5817	/* Flush pending aggregated transmission. */
5818	if (txr->hn_agg_txd != NULL)
5819		hn_flush_txagg(ifp, txr);
5820	return (sched);
5821}
5822
5823static void
5824hn_start(struct ifnet *ifp)
5825{
5826	struct hn_softc *sc = ifp->if_softc;
5827	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5828
5829	if (txr->hn_sched_tx)
5830		goto do_sched;
5831
5832	if (mtx_trylock(&txr->hn_tx_lock)) {
5833		int sched;
5834
5835		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5836		mtx_unlock(&txr->hn_tx_lock);
5837		if (!sched)
5838			return;
5839	}
5840do_sched:
5841	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5842}
5843
5844static void
5845hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5846{
5847	struct hn_tx_ring *txr = xtxr;
5848
5849	mtx_lock(&txr->hn_tx_lock);
5850	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5851	hn_start_locked(txr, 0);
5852	mtx_unlock(&txr->hn_tx_lock);
5853}
5854
5855static void
5856hn_start_txeof(struct hn_tx_ring *txr)
5857{
5858	struct hn_softc *sc = txr->hn_sc;
5859	struct ifnet *ifp = sc->hn_ifp;
5860
5861	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5862
5863	if (txr->hn_sched_tx)
5864		goto do_sched;
5865
5866	if (mtx_trylock(&txr->hn_tx_lock)) {
5867		int sched;
5868
5869		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5870		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5871		mtx_unlock(&txr->hn_tx_lock);
5872		if (sched) {
5873			taskqueue_enqueue(txr->hn_tx_taskq,
5874			    &txr->hn_tx_task);
5875		}
5876	} else {
5877do_sched:
5878		/*
5879		 * Release OACTIVE early, in the hope that others
5880		 * can catch up.  The task will clear the flag
5881		 * again while holding hn_tx_lock to avoid
5882		 * possible races.
5883		 */
5884		atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5885		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5886	}
5887}
5888
5889#endif	/* HN_IFSTART_SUPPORT */
5890
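/*
 * if_transmit TX path helper.
 *
 * Drain the per-ring buf_ring and transmit the packets.  Like
 * hn_start_locked(), a non-zero 'len' acts as a direct-transmission size
 * cutoff and a non-zero return value asks the caller to reschedule the
 * remaining work onto the TX taskqueue.
 */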
5891static int
5892hn_xmit(struct hn_tx_ring *txr, int len)
5893{
5894	struct hn_softc *sc = txr->hn_sc;
5895	struct ifnet *ifp = sc->hn_ifp;
5896	struct mbuf *m_head;
5897	int sched = 0;
5898
5899	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5900#ifdef HN_IFSTART_SUPPORT
5901	KASSERT(hn_use_if_start == 0,
5902	    ("hn_xmit is called when if_start is enabled"));
5903#endif
5904	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5905
5906	if (__predict_false(txr->hn_suspended))
5907		return (0);
5908
5909	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5910		return (0);
5911
5912	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5913		struct hn_txdesc *txd;
5914		int error;
5915
5916		if (len > 0 && m_head->m_pkthdr.len > len) {
5917			/*
5918			 * This send could be time consuming; let callers
5919			 * dispatch this packet (and any following packets)
5920			 * to the TX taskqueue.
5921			 */
5922			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5923			sched = 1;
5924			break;
5925		}
5926
5927		txd = hn_txdesc_get(txr);
5928		if (txd == NULL) {
5929			txr->hn_no_txdescs++;
5930			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5931			txr->hn_oactive = 1;
5932			break;
5933		}
5934
5935		error = hn_encap(ifp, txr, txd, &m_head);
5936		if (error) {
5937			/* Both txd and m_head are freed; discard */
5938			KASSERT(txr->hn_agg_txd == NULL,
5939			    ("encap failed w/ pending aggregating txdesc"));
5940			drbr_advance(ifp, txr->hn_mbuf_br);
5941			continue;
5942		}
5943
5944		if (txr->hn_agg_pktleft == 0) {
5945			if (txr->hn_agg_txd != NULL) {
5946				KASSERT(m_head == NULL,
5947				    ("pending mbuf for aggregating txdesc"));
5948				error = hn_flush_txagg(ifp, txr);
5949				if (__predict_false(error)) {
5950					txr->hn_oactive = 1;
5951					break;
5952				}
5953			} else {
5954				KASSERT(m_head != NULL, ("mbuf was freed"));
5955				error = hn_txpkt(ifp, txr, txd);
5956				if (__predict_false(error)) {
5957					/* txd is freed, but m_head is not */
5958					drbr_putback(ifp, txr->hn_mbuf_br,
5959					    m_head);
5960					txr->hn_oactive = 1;
5961					break;
5962				}
5963			}
5964		}
5965#ifdef INVARIANTS
5966		else {
5967			KASSERT(txr->hn_agg_txd != NULL,
5968			    ("no aggregating txdesc"));
5969			KASSERT(m_head == NULL,
5970			    ("pending mbuf for aggregating txdesc"));
5971		}
5972#endif
5973
5974		/* Sent */
5975		drbr_advance(ifp, txr->hn_mbuf_br);
5976	}
5977
5978	/* Flush pending aggregated transmission. */
5979	if (txr->hn_agg_txd != NULL)
5980		hn_flush_txagg(ifp, txr);
5981	return (sched);
5982}
5983
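/*
 * if_transmit method.
 *
 * If a transparent VF is active, hand the packet to the VF (tapping BPF
 * on this interface as configured); otherwise fix up the packet headers,
 * pick a TX ring based on the flowid and enqueue the packet onto that
 * ring's buf_ring.
 */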
5984static int
5985hn_transmit(struct ifnet *ifp, struct mbuf *m)
5986{
5987	struct hn_softc *sc = ifp->if_softc;
5988	struct hn_tx_ring *txr;
5989	int error, idx = 0;
5990
5991	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5992		struct rm_priotracker pt;
5993
5994		rm_rlock(&sc->hn_vf_lock, &pt);
5995		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5996			struct mbuf *m_bpf = NULL;
5997			int obytes, omcast;
5998
5999			obytes = m->m_pkthdr.len;
6000			omcast = (m->m_flags & M_MCAST) != 0;
6001
6002			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
6003				if (bpf_peers_present(ifp->if_bpf)) {
6004					m_bpf = m_copypacket(m, M_NOWAIT);
6005					if (m_bpf == NULL) {
6006						/*
6007						 * Failed to grab a shallow
6008						 * copy; tap now.
6009						 */
6010						ETHER_BPF_MTAP(ifp, m);
6011					}
6012				}
6013			} else {
6014				ETHER_BPF_MTAP(ifp, m);
6015			}
6016
6017			error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
6018			rm_runlock(&sc->hn_vf_lock, &pt);
6019
6020			if (m_bpf != NULL) {
6021				if (!error)
6022					ETHER_BPF_MTAP(ifp, m_bpf);
6023				m_freem(m_bpf);
6024			}
6025
6026			if (error == ENOBUFS) {
6027				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6028			} else if (error) {
6029				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6030			} else {
6031				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6032				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6033				if (omcast) {
6034					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6035					    omcast);
6036				}
6037			}
6038			return (error);
6039		}
6040		rm_runlock(&sc->hn_vf_lock, &pt);
6041	}
6042
6043#if defined(INET6) || defined(INET)
6044	/*
6045	 * Perform TSO packet header fixup or get l2/l3 header length now,
6046	 * since packet headers should be cache-hot.
6047	 */
6048	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6049		m = hn_tso_fixup(m);
6050		if (__predict_false(m == NULL)) {
6051			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6052			return EIO;
6053		}
6054	} else if (m->m_pkthdr.csum_flags &
6055	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6056		m = hn_set_hlen(m);
6057		if (__predict_false(m == NULL)) {
6058			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6059			return EIO;
6060		}
6061	}
6062#endif
6063
6064	/*
6065	 * Select the TX ring based on flowid
6066	 */
6067	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6068#ifdef RSS
6069		uint32_t bid;
6070
6071		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6072		    &bid) == 0)
6073			idx = bid % sc->hn_tx_ring_inuse;
6074		else
6075#endif
6076		{
6077#if defined(INET6) || defined(INET)
6078			int tcpsyn = 0;
6079
6080			if (m->m_pkthdr.len < 128 &&
6081			    (m->m_pkthdr.csum_flags &
6082			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6083			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6084				m = hn_check_tcpsyn(m, &tcpsyn);
6085				if (__predict_false(m == NULL)) {
6086					if_inc_counter(ifp,
6087					    IFCOUNTER_OERRORS, 1);
6088					return (EIO);
6089				}
6090			}
6091#else
6092			const int tcpsyn = 0;
6093#endif
6094			if (tcpsyn)
6095				idx = 0;
6096			else
6097				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6098		}
6099	}
6100	txr = &sc->hn_tx_ring[idx];
6101
6102	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6103	if (error) {
6104		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6105		return error;
6106	}
6107
6108	if (txr->hn_oactive)
6109		return 0;
6110
6111	if (txr->hn_sched_tx)
6112		goto do_sched;
6113
6114	if (mtx_trylock(&txr->hn_tx_lock)) {
6115		int sched;
6116
6117		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6118		mtx_unlock(&txr->hn_tx_lock);
6119		if (!sched)
6120			return 0;
6121	}
6122do_sched:
6123	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6124	return 0;
6125}
6126
6127static void
6128hn_tx_ring_qflush(struct hn_tx_ring *txr)
6129{
6130	struct mbuf *m;
6131
6132	mtx_lock(&txr->hn_tx_lock);
6133	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6134		m_freem(m);
6135	mtx_unlock(&txr->hn_tx_lock);
6136}
6137
6138static void
6139hn_xmit_qflush(struct ifnet *ifp)
6140{
6141	struct hn_softc *sc = ifp->if_softc;
6142	struct rm_priotracker pt;
6143	int i;
6144
6145	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6146		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6147	if_qflush(ifp);
6148
6149	rm_rlock(&sc->hn_vf_lock, &pt);
6150	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6151		sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6152	rm_runlock(&sc->hn_vf_lock, &pt);
6153}
6154
6155static void
6156hn_xmit_txeof(struct hn_tx_ring *txr)
6157{
6158
6159	if (txr->hn_sched_tx)
6160		goto do_sched;
6161
6162	if (mtx_trylock(&txr->hn_tx_lock)) {
6163		int sched;
6164
6165		txr->hn_oactive = 0;
6166		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6167		mtx_unlock(&txr->hn_tx_lock);
6168		if (sched) {
6169			taskqueue_enqueue(txr->hn_tx_taskq,
6170			    &txr->hn_tx_task);
6171		}
6172	} else {
6173do_sched:
6174		/*
6175		 * Release oactive early, in the hope that others
6176		 * can catch up.  The task will clear oactive
6177		 * again while holding hn_tx_lock to avoid
6178		 * possible races.
6179		 */
6180		txr->hn_oactive = 0;
6181		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6182	}
6183}
6184
6185static void
6186hn_xmit_taskfunc(void *xtxr, int pending __unused)
6187{
6188	struct hn_tx_ring *txr = xtxr;
6189
6190	mtx_lock(&txr->hn_tx_lock);
6191	hn_xmit(txr, 0);
6192	mtx_unlock(&txr->hn_tx_lock);
6193}
6194
6195static void
6196hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6197{
6198	struct hn_tx_ring *txr = xtxr;
6199
6200	mtx_lock(&txr->hn_tx_lock);
6201	txr->hn_oactive = 0;
6202	hn_xmit(txr, 0);
6203	mtx_unlock(&txr->hn_tx_lock);
6204}
6205
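/*
 * Associate a VMBus channel with its RX ring (and TX ring, if any),
 * bind the channel to a proper CPU and open it with the per-ring
 * bufring.
 */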
6206static int
6207hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6208{
6209	struct vmbus_chan_br cbr;
6210	struct hn_rx_ring *rxr;
6211	struct hn_tx_ring *txr = NULL;
6212	int idx, error;
6213
6214	idx = vmbus_chan_subidx(chan);
6215
6216	/*
6217	 * Link this channel to RX/TX ring.
6218	 */
6219	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6220	    ("invalid channel index %d, should be >= 0 && < %d",
6221	     idx, sc->hn_rx_ring_inuse));
6222	rxr = &sc->hn_rx_ring[idx];
6223	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6224	    ("RX ring %d already attached", idx));
6225	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6226	rxr->hn_chan = chan;
6227
6228	if (bootverbose) {
6229		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6230		    idx, vmbus_chan_id(chan));
6231	}
6232
6233	if (idx < sc->hn_tx_ring_inuse) {
6234		txr = &sc->hn_tx_ring[idx];
6235		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6236		    ("TX ring %d already attached", idx));
6237		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6238
6239		txr->hn_chan = chan;
6240		if (bootverbose) {
6241			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6242			    idx, vmbus_chan_id(chan));
6243		}
6244	}
6245
6246	/* Bind this channel to a proper CPU. */
6247	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6248
6249	/*
6250	 * Open this channel
6251	 */
6252	cbr.cbr = rxr->hn_br;
6253	cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6254	cbr.cbr_txsz = HN_TXBR_SIZE;
6255	cbr.cbr_rxsz = HN_RXBR_SIZE;
6256	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6257	if (error) {
6258		if (error == EISCONN) {
6259			if_printf(sc->hn_ifp, "bufring is connected after "
6260			    "chan%u open failure\n", vmbus_chan_id(chan));
6261			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6262		} else {
6263			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6264			    vmbus_chan_id(chan), error);
6265		}
6266	}
6267	return (error);
6268}
6269
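/*
 * Undo hn_chan_attach(): unlink the channel from its RX/TX ring and
 * close it.  The channel itself is not destroyed here.
 */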
6270static void
6271hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6272{
6273	struct hn_rx_ring *rxr;
6274	int idx, error;
6275
6276	idx = vmbus_chan_subidx(chan);
6277
6278	/*
6279	 * Unlink this channel from the RX/TX ring.
6280	 */
6281	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6282	    ("invalid channel index %d, should be >= 0 && < %d",
6283	     idx, sc->hn_rx_ring_inuse));
6284	rxr = &sc->hn_rx_ring[idx];
6285	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6286	    ("RX ring %d is not attached", idx));
6287	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6288
6289	if (idx < sc->hn_tx_ring_inuse) {
6290		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6291
6292		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6293		    ("TX ring %d is not attached", idx));
6294		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6295	}
6296
6297	/*
6298	 * Close this channel.
6299	 *
6300	 * NOTE:
6301	 * Channel closing does _not_ destroy the target channel.
6302	 */
6303	error = vmbus_chan_close_direct(chan);
6304	if (error == EISCONN) {
6305		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6306		    "after being closed\n", vmbus_chan_id(chan));
6307		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6308	} else if (error) {
6309		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6310		    vmbus_chan_id(chan), error);
6311	}
6312}
6313
6314static int
6315hn_attach_subchans(struct hn_softc *sc)
6316{
6317	struct vmbus_channel **subchans;
6318	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6319	int i, error = 0;
6320
6321	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6322
6323	/* Attach the sub-channels. */
6324	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6325	for (i = 0; i < subchan_cnt; ++i) {
6326		int error1;
6327
6328		error1 = hn_chan_attach(sc, subchans[i]);
6329		if (error1) {
6330			error = error1;
6331			/* Move on; all channels will be detached later. */
6332		}
6333	}
6334	vmbus_subchan_rel(subchans, subchan_cnt);
6335
6336	if (error) {
6337		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6338	} else {
6339		if (bootverbose) {
6340			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6341			    subchan_cnt);
6342		}
6343	}
6344	return (error);
6345}
6346
6347static void
6348hn_detach_allchans(struct hn_softc *sc)
6349{
6350	struct vmbus_channel **subchans;
6351	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6352	int i;
6353
6354	if (subchan_cnt == 0)
6355		goto back;
6356
6357	/* Detach the sub-channels. */
6358	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6359	for (i = 0; i < subchan_cnt; ++i)
6360		hn_chan_detach(sc, subchans[i]);
6361	vmbus_subchan_rel(subchans, subchan_cnt);
6362
6363back:
6364	/*
6365	 * Detach the primary channel, _after_ all sub-channels
6366	 * are detached.
6367	 */
6368	hn_chan_detach(sc, sc->hn_prichan);
6369
6370	/* Wait for sub-channels to be destroyed, if any. */
6371	vmbus_subchan_drain(sc->hn_prichan);
6372
6373#ifdef INVARIANTS
6374	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6375		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6376		    HN_RX_FLAG_ATTACHED) == 0,
6377		    ("%dth RX ring is still attached", i));
6378	}
6379	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6380		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6381		    HN_TX_FLAG_ATTACHED) == 0,
6382		    ("%dth TX ring is still attached", i));
6383	}
6384#endif
6385}
6386
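/*
 * Negotiate the number of sub-channels with the host.
 *
 * On input *nsubch is the desired number of sub-channels; on return it
 * holds the number actually allocated by NVS.  *nsubch == 0 means only
 * the primary channel will be used and is not an error.
 */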
6387static int
6388hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6389{
6390	struct vmbus_channel **subchans;
6391	int nchan, rxr_cnt, error;
6392
6393	nchan = *nsubch + 1;
6394	if (nchan == 1) {
6395		/*
6396		 * Multiple RX/TX rings are not requested.
6397		 */
6398		*nsubch = 0;
6399		return (0);
6400	}
6401
6402	/*
6403	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6404	 * table entries.
6405	 */
6406	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6407	if (error) {
6408		/* No RSS; this is benign. */
6409		*nsubch = 0;
6410		return (0);
6411	}
6412	if (bootverbose) {
6413		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6414		    rxr_cnt, nchan);
6415	}
6416
6417	if (nchan > rxr_cnt)
6418		nchan = rxr_cnt;
6419	if (nchan == 1) {
6420		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6421		*nsubch = 0;
6422		return (0);
6423	}
6424
6425	/*
6426	 * Allocate sub-channels from NVS.
6427	 */
6428	*nsubch = nchan - 1;
6429	error = hn_nvs_alloc_subchans(sc, nsubch);
6430	if (error || *nsubch == 0) {
6431		/* Failed to allocate sub-channels. */
6432		*nsubch = 0;
6433		return (0);
6434	}
6435
6436	/*
6437	 * Wait for all sub-channels to become ready before moving on.
6438	 */
6439	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6440	vmbus_subchan_rel(subchans, *nsubch);
6441	return (0);
6442}
6443
6444static bool
6445hn_synth_attachable(const struct hn_softc *sc)
6446{
6447	int i;
6448
6449	if (sc->hn_flags & HN_FLAG_ERRORS)
6450		return (false);
6451
6452	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6453		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6454
6455		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6456			return (false);
6457	}
6458	return (true);
6459}
6460
6461/*
6462 * Make sure that the RX filter is zero after the successful
6463 * RNDIS initialization.
6464 *
6465 * NOTE:
6466 * Under certain conditions on certain versions of Hyper-V,
6467 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6468 * after the successful RNDIS initialization, which breaks
6469 * the assumption of any following code (well, it breaks the
6470 * RNDIS API contract actually).  Clear the RNDIS rxfilter
6471 * explicitly, drain packets sneaking through, and drain the
6472 * interrupt taskqueues scheduled due to the stealth packets.
6473 */
6474static void
6475hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6476{
6477
6478	hn_disable_rx(sc);
6479	hn_drain_rxtx(sc, nchan);
6480}
6481
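/*
 * Attach the synthetic parts: the primary channel first, then NVS, then
 * RNDIS, then the sub-channels, and finally the RSS key and indirect
 * table.  On failure, whatever was attached is torn down and the old
 * capabilities are restored.
 */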
6482static int
6483hn_synth_attach(struct hn_softc *sc, int mtu)
6484{
6485#define ATTACHED_NVS		0x0002
6486#define ATTACHED_RNDIS		0x0004
6487
6488	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6489	int error, nsubch, nchan = 1, i, rndis_inited;
6490	uint32_t old_caps, attached = 0;
6491
6492	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6493	    ("synthetic parts were attached"));
6494
6495	if (!hn_synth_attachable(sc))
6496		return (ENXIO);
6497
6498	/* Save capabilities for later verification. */
6499	old_caps = sc->hn_caps;
6500	sc->hn_caps = 0;
6501
6502	/* Clear RSS stuffs. */
6503	sc->hn_rss_ind_size = 0;
6504	sc->hn_rss_hash = 0;
6505	sc->hn_rss_hcap = 0;
6506
6507	/*
6508	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6509	 */
6510	error = hn_chan_attach(sc, sc->hn_prichan);
6511	if (error)
6512		goto failed;
6513
6514	/*
6515	 * Attach NVS.
6516	 */
6517	error = hn_nvs_attach(sc, mtu);
6518	if (error)
6519		goto failed;
6520	attached |= ATTACHED_NVS;
6521
6522	/*
6523	 * Attach RNDIS _after_ NVS is attached.
6524	 */
6525	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6526	if (rndis_inited)
6527		attached |= ATTACHED_RNDIS;
6528	if (error)
6529		goto failed;
6530
6531	/*
6532	 * Make sure capabilities are not changed.
6533	 */
6534	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6535		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6536		    old_caps, sc->hn_caps);
6537		error = ENXIO;
6538		goto failed;
6539	}
6540
6541	/*
6542	 * Allocate sub-channels for multi-TX/RX rings.
6543	 *
6544	 * NOTE:
6545	 * The # of RX rings that can be used is equivalent to the # of
6546	 * channels to be requested.
6547	 */
6548	nsubch = sc->hn_rx_ring_cnt - 1;
6549	error = hn_synth_alloc_subchans(sc, &nsubch);
6550	if (error)
6551		goto failed;
6552	/* NOTE: _Full_ synthetic parts detach is required now. */
6553	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6554
6555	/*
6556	 * Set the # of TX/RX rings that could be used according to
6557	 * the # of channels that NVS offered.
6558	 */
6559	nchan = nsubch + 1;
6560	hn_set_ring_inuse(sc, nchan);
6561	if (nchan == 1) {
6562		/* Only the primary channel can be used; done */
6563		goto back;
6564	}
6565
6566	/*
6567	 * Attach the sub-channels.
6568	 *
6569	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6570	 */
6571	error = hn_attach_subchans(sc);
6572	if (error)
6573		goto failed;
6574
6575	/*
6576	 * Configure RSS key and indirect table _after_ all sub-channels
6577	 * are attached.
6578	 */
6579	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6580		/*
6581		 * RSS key is not set yet; set it to the default RSS key.
6582		 */
6583		if (bootverbose)
6584			if_printf(sc->hn_ifp, "setup default RSS key\n");
6585#ifdef RSS
6586		rss_getkey(rss->rss_key);
6587#else
6588		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6589#endif
6590		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6591	}
6592
6593	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6594		/*
6595		 * RSS indirect table is not set yet; set it up in round-
6596		 * robin fashion.
6597		 */
6598		if (bootverbose) {
6599			if_printf(sc->hn_ifp, "setup default RSS indirect "
6600			    "table\n");
6601		}
6602		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6603			uint32_t subidx;
6604
6605#ifdef RSS
6606			subidx = rss_get_indirection_to_bucket(i);
6607#else
6608			subidx = i;
6609#endif
6610			rss->rss_ind[i] = subidx % nchan;
6611		}
6612		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6613	} else {
6614		/*
6615		 * # of usable channels may be changed, so we have to
6616		 * make sure that all entries in RSS indirect table
6617		 * are valid.
6618		 *
6619		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6620		 */
6621		hn_rss_ind_fixup(sc);
6622	}
6623
6624	sc->hn_rss_hash = sc->hn_rss_hcap;
6625	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6626	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6627		/* NOTE: Don't reconfigure RSS here; it is done right below. */
6628		hn_vf_rss_fixup(sc, false);
6629	}
6630	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6631	if (error)
6632		goto failed;
6633back:
6634	/*
6635	 * Fixup transmission aggregation setup.
6636	 */
6637	hn_set_txagg(sc);
6638	hn_rndis_init_fixat(sc, nchan);
6639	return (0);
6640
6641failed:
6642	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6643		hn_rndis_init_fixat(sc, nchan);
6644		hn_synth_detach(sc);
6645	} else {
6646		if (attached & ATTACHED_RNDIS) {
6647			hn_rndis_init_fixat(sc, nchan);
6648			hn_rndis_detach(sc);
6649		}
6650		if (attached & ATTACHED_NVS)
6651			hn_nvs_detach(sc);
6652		hn_chan_detach(sc, sc->hn_prichan);
6653		/* Restore old capabilities. */
6654		sc->hn_caps = old_caps;
6655	}
6656	return (error);
6657
6658#undef ATTACHED_RNDIS
6659#undef ATTACHED_NVS
6660}
6661
6662/*
6663 * NOTE:
6664 * The interface must have been suspended through hn_suspend() before
6665 * this function gets called.
6666 */
6667static void
6668hn_synth_detach(struct hn_softc *sc)
6669{
6670
6671	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6672	    ("synthetic parts were not attached"));
6673
6674	/* Detach the RNDIS first. */
6675	hn_rndis_detach(sc);
6676
6677	/* Detach NVS. */
6678	hn_nvs_detach(sc);
6679
6680	/* Detach all of the channels. */
6681	hn_detach_allchans(sc);
6682
6683	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6684		/*
6685		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6686		 */
6687		int error;
6688
6689		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6690		    sc->hn_rxbuf_gpadl);
6691		if (error) {
6692			if_printf(sc->hn_ifp,
6693			    "rxbuf gpadl disconn failed: %d\n", error);
6694			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6695		}
6696		sc->hn_rxbuf_gpadl = 0;
6697	}
6698
6699	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6700		/*
6701		 * Host is post-Win2016, disconnect chimney sending buffer from
6702		 * primary channel here.
6703		 */
6704		int error;
6705
6706		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6707		    sc->hn_chim_gpadl);
6708		if (error) {
6709			if_printf(sc->hn_ifp,
6710			    "chim gpadl disconn failed: %d\n", error);
6711			sc->hn_flags |= HN_FLAG_CHIM_REF;
6712		}
6713		sc->hn_chim_gpadl = 0;
6714	}
6715	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6716}
6717
6718static void
6719hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6720{
6721	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6722	    ("invalid ring count %d", ring_cnt));
6723
6724	if (sc->hn_tx_ring_cnt > ring_cnt)
6725		sc->hn_tx_ring_inuse = ring_cnt;
6726	else
6727		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6728	sc->hn_rx_ring_inuse = ring_cnt;
6729
6730#ifdef RSS
6731	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6732		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6733		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6734		    rss_getnumbuckets());
6735	}
6736#endif
6737
6738	if (bootverbose) {
6739		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6740		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6741	}
6742}
6743
6744static void
6745hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6746{
6747
6748	/*
6749	 * NOTE:
6750	 * The TX bufring will not be drained by the hypervisor,
6751	 * if the primary channel is revoked.
6752	 */
6753	while (!vmbus_chan_rx_empty(chan) ||
6754	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6755	     !vmbus_chan_tx_empty(chan)))
6756		pause("waitch", 1);
6757	vmbus_chan_intr_drain(chan);
6758}
6759
6760static void
6761hn_disable_rx(struct hn_softc *sc)
6762{
6763
6764	/*
6765	 * Disable RX by clearing RX filter forcefully.
6766	 */
6767	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6768	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6769
6770	/*
6771	 * Give RNDIS enough time to flush all pending data packets.
6772	 */
6773	pause("waitrx", (200 * hz) / 1000);
6774}
6775
6776/*
6777 * NOTE:
6778 * RX/TX _must_ have been suspended/disabled, before this function
6779 * is called.
6780 */
6781static void
6782hn_drain_rxtx(struct hn_softc *sc, int nchan)
6783{
6784	struct vmbus_channel **subch = NULL;
6785	int nsubch;
6786
6787	/*
6788	 * Drain RX/TX bufrings and interrupts.
6789	 */
6790	nsubch = nchan - 1;
6791	if (nsubch > 0)
6792		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6793
6794	if (subch != NULL) {
6795		int i;
6796
6797		for (i = 0; i < nsubch; ++i)
6798			hn_chan_drain(sc, subch[i]);
6799	}
6800	hn_chan_drain(sc, sc->hn_prichan);
6801
6802	if (subch != NULL)
6803		vmbus_subchan_rel(subch, nsubch);
6804}
6805
6806static void
6807hn_suspend_data(struct hn_softc *sc)
6808{
6809	struct hn_tx_ring *txr;
6810	int i;
6811
6812	HN_LOCK_ASSERT(sc);
6813
6814	/*
6815	 * Suspend TX.
6816	 */
6817	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6818		txr = &sc->hn_tx_ring[i];
6819
6820		mtx_lock(&txr->hn_tx_lock);
6821		txr->hn_suspended = 1;
6822		mtx_unlock(&txr->hn_tx_lock);
6823		/* No one is able to send more packets now. */
6824
6825		/*
6826		 * Wait for all pending sends to finish.
6827		 *
6828		 * NOTE:
6829		 * We will _not_ receive all pending send-done, if the
6830		 * primary channel is revoked.
6831		 */
6832		while (hn_tx_ring_pending(txr) &&
6833		    !vmbus_chan_is_revoked(sc->hn_prichan))
6834			pause("hnwtx", 1 /* 1 tick */);
6835	}
6836
6837	/*
6838	 * Disable RX.
6839	 */
6840	hn_disable_rx(sc);
6841
6842	/*
6843	 * Drain RX/TX.
6844	 */
6845	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6846
6847	/*
6848	 * Drain any pending TX tasks.
6849	 *
6850	 * NOTE:
6851	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6852	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6853	 */
6854	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6855		txr = &sc->hn_tx_ring[i];
6856
6857		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6858		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6859	}
6860}
6861
6862static void
6863hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6864{
6865
6866	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6867}
6868
6869static void
6870hn_suspend_mgmt(struct hn_softc *sc)
6871{
6872	struct task task;
6873
6874	HN_LOCK_ASSERT(sc);
6875
6876	/*
6877	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6878	 * through hn_mgmt_taskq.
6879	 */
6880	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6881	vmbus_chan_run_task(sc->hn_prichan, &task);
6882
6883	/*
6884	 * Make sure that all pending management tasks are completed.
6885	 */
6886	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6887	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6888	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6889}
6890
6891static void
6892hn_suspend(struct hn_softc *sc)
6893{
6894
6895	/* Disable polling. */
6896	hn_polling(sc, 0);
6897
6898	/*
6899	 * If the non-transparent mode VF is activated, the synthetic
6900	 * device is receiving packets, so the data path of the
6901	 * synthetic device must be suspended.
6902	 */
6903	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6904	    (sc->hn_flags & HN_FLAG_RXVF))
6905		hn_suspend_data(sc);
6906	hn_suspend_mgmt(sc);
6907}
6908
6909static void
6910hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6911{
6912	int i;
6913
6914	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6915	    ("invalid TX ring count %d", tx_ring_cnt));
6916
6917	for (i = 0; i < tx_ring_cnt; ++i) {
6918		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6919
6920		mtx_lock(&txr->hn_tx_lock);
6921		txr->hn_suspended = 0;
6922		mtx_unlock(&txr->hn_tx_lock);
6923	}
6924}
6925
6926static void
6927hn_resume_data(struct hn_softc *sc)
6928{
6929	int i;
6930
6931	HN_LOCK_ASSERT(sc);
6932
6933	/*
6934	 * Re-enable RX.
6935	 */
6936	hn_rxfilter_config(sc);
6937
6938	/*
6939	 * Make sure to clear suspend status on "all" TX rings,
6940	 * since hn_tx_ring_inuse can be changed after
6941	 * hn_suspend_data().
6942	 */
6943	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6944
6945#ifdef HN_IFSTART_SUPPORT
6946	if (!hn_use_if_start)
6947#endif
6948	{
6949		/*
6950		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6951		 * reduced.
6952		 */
6953		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6954			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6955	}
6956
6957	/*
6958	 * Kick start TX.
6959	 */
6960	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6961		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6962
6963		/*
6964		 * Use txeof task, so that any pending oactive can be
6965		 * cleared properly.
6966		 */
6967		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6968	}
6969}
6970
6971static void
6972hn_resume_mgmt(struct hn_softc *sc)
6973{
6974
6975	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6976
6977	/*
6978	 * Kick off network change detection, if it was pending.
6979	 * If no network change was pending, start link status
6980	 * checks, which is more lightweight than network change
6981	 * detection.
6982	 */
6983	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6984		hn_change_network(sc);
6985	else
6986		hn_update_link_status(sc);
6987}
6988
6989static void
6990hn_resume(struct hn_softc *sc)
6991{
6992
6993	/*
6994	 * If the non-transparent mode VF is activated, the synthetic
6995	 * device has to receive packets, so the data path of the
6996	 * synthetic device must be resumed.
6997	 */
6998	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6999	    (sc->hn_flags & HN_FLAG_RXVF))
7000		hn_resume_data(sc);
7001
7002	/*
7003	 * Don't resume link status change if VF is attached/activated.
7004	 * - In the non-transparent VF mode, the synthetic device marks
7005	 *   link down until the VF is deactivated; i.e. VF is down.
7006	 * - In transparent VF mode, VF's media status is used until
7007	 *   the VF is detached.
7008	 */
7009	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
7010	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
7011		hn_resume_mgmt(sc);
7012
7013	/*
7014	 * Re-enable polling if this interface is running and
7015	 * the polling is requested.
7016	 */
7017	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
7018		hn_polling(sc, sc->hn_pollhz);
7019}
7020
7021static void
7022hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
7023{
7024	const struct rndis_status_msg *msg;
7025	int ofs;
7026
7027	if (dlen < sizeof(*msg)) {
7028		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7029		return;
7030	}
7031	msg = data;
7032
7033	switch (msg->rm_status) {
7034	case RNDIS_STATUS_MEDIA_CONNECT:
7035	case RNDIS_STATUS_MEDIA_DISCONNECT:
7036		hn_update_link_status(sc);
7037		break;
7038
7039	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7040	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7041		/* Not really useful; ignore. */
7042		break;
7043
7044	case RNDIS_STATUS_NETWORK_CHANGE:
7045		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7046		if (dlen < ofs + msg->rm_stbuflen ||
7047		    msg->rm_stbuflen < sizeof(uint32_t)) {
7048			if_printf(sc->hn_ifp, "network changed\n");
7049		} else {
7050			uint32_t change;
7051
7052			memcpy(&change, ((const uint8_t *)msg) + ofs,
7053			    sizeof(change));
7054			if_printf(sc->hn_ifp, "network changed, change %u\n",
7055			    change);
7056		}
7057		hn_change_network(sc);
7058		break;
7059
7060	default:
7061		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7062		    msg->rm_status);
7063		break;
7064	}
7065}
7066
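/*
 * Walk the RNDIS per-packet-info list and record pointers to the
 * recognized entries (VLAN, checksum, hash value/info, pktinfo id)
 * in *info.  Returns EINVAL if any entry is malformed.
 */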
7067static int
7068hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7069{
7070	const struct rndis_pktinfo *pi = info_data;
7071	uint32_t mask = 0;
7072
7073	while (info_dlen != 0) {
7074		const void *data;
7075		uint32_t dlen;
7076
7077		if (__predict_false(info_dlen < sizeof(*pi)))
7078			return (EINVAL);
7079		if (__predict_false(info_dlen < pi->rm_size))
7080			return (EINVAL);
7081		info_dlen -= pi->rm_size;
7082
7083		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7084			return (EINVAL);
7085		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7086			return (EINVAL);
7087		dlen = pi->rm_size - pi->rm_pktinfooffset;
7088		data = pi->rm_data;
7089
7090		if (pi->rm_internal == 1) {
7091			switch (pi->rm_type) {
7092			case NDIS_PKTINFO_IT_PKTINFO_ID:
7093				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7094					return (EINVAL);
7095				info->pktinfo_id =
7096				    (const struct packet_info_id *)data;
7097				mask |= HN_RXINFO_PKTINFO_ID;
7098				break;
7099
7100			default:
7101				goto next;
7102			}
7103		} else {
7104			switch (pi->rm_type) {
7105			case NDIS_PKTINFO_TYPE_VLAN:
7106				if (__predict_false(dlen
7107				    < NDIS_VLAN_INFO_SIZE))
7108					return (EINVAL);
7109				info->vlan_info = (const uint32_t *)data;
7110				mask |= HN_RXINFO_VLAN;
7111				break;
7112
7113			case NDIS_PKTINFO_TYPE_CSUM:
7114				if (__predict_false(dlen
7115				    < NDIS_RXCSUM_INFO_SIZE))
7116					return (EINVAL);
7117				info->csum_info = (const uint32_t *)data;
7118				mask |= HN_RXINFO_CSUM;
7119				break;
7120
7121			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7122				if (__predict_false(dlen
7123				    < HN_NDIS_HASH_VALUE_SIZE))
7124					return (EINVAL);
7125				info->hash_value = (const uint32_t *)data;
7126				mask |= HN_RXINFO_HASHVAL;
7127				break;
7128
7129			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7130				if (__predict_false(dlen
7131				    < HN_NDIS_HASH_INFO_SIZE))
7132					return (EINVAL);
7133				info->hash_info = (const uint32_t *)data;
7134				mask |= HN_RXINFO_HASHINF;
7135				break;
7136
7137			default:
7138				goto next;
7139			}
7140		}
7141
7142		if (mask == HN_RXINFO_ALL) {
7143			/* All found; done */
7144			break;
7145		}
7146next:
7147		pi = (const struct rndis_pktinfo *)
7148		    ((const uint8_t *)pi + pi->rm_size);
7149	}
7150
7151	/*
7152	 * Final fixup.
7153	 * - If there is no hash value, invalidate the hash info.
7154	 */
7155	if ((mask & HN_RXINFO_HASHVAL) == 0)
7156		info->hash_info = NULL;
7157	return (0);
7158}
7159
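/*
 * Returns true if the region [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */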
7160static __inline bool
7161hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7162{
7163
7164	if (off < check_off) {
7165		if (__predict_true(off + len <= check_off))
7166			return (false);
7167	} else if (off > check_off) {
7168		if (__predict_true(check_off + check_len <= off))
7169			return (false);
7170	}
7171	return (true);
7172}
7173
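/*
 * Append one RSC (receive segment coalescing) fragment to the per-ring
 * aggregation state; the first fragment of a packet also records the
 * packet's per-packet-info.
 */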
7174static __inline void
7175hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7176		uint32_t len, struct hn_rxinfo *info)
7177{
7178	uint32_t cnt = rxr->rsc.cnt;
7179
7180	if (cnt) {
7181		rxr->rsc.pktlen += len;
7182	} else {
7183		rxr->rsc.vlan_info = info->vlan_info;
7184		rxr->rsc.csum_info = info->csum_info;
7185		rxr->rsc.hash_info = info->hash_info;
7186		rxr->rsc.hash_value = info->hash_value;
7187		rxr->rsc.pktlen = len;
7188	}
7189
7190	rxr->rsc.frag_data[cnt] = data;
7191	rxr->rsc.frag_len[cnt] = len;
7192	rxr->rsc.cnt++;
7193}
7194
7195static void
7196hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7197{
7198	const struct rndis_packet_msg *pkt;
7199	struct hn_rxinfo info;
7200	int data_off, pktinfo_off, data_len, pktinfo_len;
7201	bool rsc_more = false;
7202
7203	/*
7204	 * Check length.
7205	 */
7206	if (__predict_false(dlen < sizeof(*pkt))) {
7207		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7208		return;
7209	}
7210	pkt = data;
7211
7212	if (__predict_false(dlen < pkt->rm_len)) {
7213		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7214		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7215		return;
7216	}
7217	if (__predict_false(pkt->rm_len <
7218	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7219		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7220		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7221		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7222		    pkt->rm_pktinfolen);
7223		return;
7224	}
7225	if (__predict_false(pkt->rm_datalen == 0)) {
7226		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7227		return;
7228	}
7229
7230	/*
7231	 * Check offsets.
7232	 */
7233#define IS_OFFSET_INVALID(ofs)			\
7234	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7235	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7236
7237	/* XXX Hyper-V does not meet data offset alignment requirement */
7238	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7239		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7240		    "data offset %u\n", pkt->rm_dataoffset);
7241		return;
7242	}
7243	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7244	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7245		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7246		    "oob offset %u\n", pkt->rm_oobdataoffset);
7247		return;
7248	}
7249	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7250	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7251		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7252		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7253		return;
7254	}
7255
7256#undef IS_OFFSET_INVALID
7257
7258	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7259	data_len = pkt->rm_datalen;
7260	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7261	pktinfo_len = pkt->rm_pktinfolen;
7262
7263	/*
7264	 * Check OOB coverage.
7265	 */
7266	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7267		int oob_off, oob_len;
7268
7269		if_printf(rxr->hn_ifp, "got oobdata\n");
7270		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7271		oob_len = pkt->rm_oobdatalen;
7272
7273		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7274			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7275			    "oob overflow, msglen %u, oob abs %d len %d\n",
7276			    pkt->rm_len, oob_off, oob_len);
7277			return;
7278		}
7279
7280		/*
7281		 * Check against data.
7282		 */
7283		if (hn_rndis_check_overlap(oob_off, oob_len,
7284		    data_off, data_len)) {
7285			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7286			    "oob overlaps data, oob abs %d len %d, "
7287			    "data abs %d len %d\n",
7288			    oob_off, oob_len, data_off, data_len);
7289			return;
7290		}
7291
7292		/*
7293		 * Check against pktinfo.
7294		 */
7295		if (pktinfo_len != 0 &&
7296		    hn_rndis_check_overlap(oob_off, oob_len,
7297		    pktinfo_off, pktinfo_len)) {
7298			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7299			    "oob overlaps pktinfo, oob abs %d len %d, "
7300			    "pktinfo abs %d len %d\n",
7301			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7302			return;
7303		}
7304	}
7305
7306	/*
7307	 * Check per-packet-info coverage and find useful per-packet-info.
7308	 */
7309	info.vlan_info = NULL;
7310	info.csum_info = NULL;
7311	info.hash_info = NULL;
7312	info.pktinfo_id = NULL;
7313
7314	if (__predict_true(pktinfo_len != 0)) {
7315		bool overlap;
7316		int error;
7317
7318		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7319			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7320			    "pktinfo overflow, msglen %u, "
7321			    "pktinfo abs %d len %d\n",
7322			    pkt->rm_len, pktinfo_off, pktinfo_len);
7323			return;
7324		}
7325
7326		/*
7327		 * Check packet info coverage.
7328		 */
7329		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7330		    data_off, data_len);
7331		if (__predict_false(overlap)) {
7332			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7333			    "pktinfo overlaps data, pktinfo abs %d len %d, "
7334			    "data abs %d len %d\n",
7335			    pktinfo_off, pktinfo_len, data_off, data_len);
7336			return;
7337		}
7338
7339		/*
7340		 * Find useful per-packet-info.
7341		 */
7342		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7343		    pktinfo_len, &info);
7344		if (__predict_false(error)) {
7345			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7346			    "pktinfo\n");
7347			return;
7348		}
7349	}
7350
7351	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7352		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7353		    "data overflow, msglen %u, data abs %d len %d\n",
7354		    pkt->rm_len, data_off, data_len);
7355		return;
7356	}
7357
7358	/* Identify RSC fragments, drop invalid packets */
7359	if ((info.pktinfo_id != NULL) &&
7360	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7361		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7362			rxr->rsc.cnt = 0;
7363			rxr->hn_rsc_pkts++;
7364		} else if (rxr->rsc.cnt == 0)
7365			goto drop;
7366
7367		rsc_more = true;
7368
7369		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7370			rsc_more = false;
7371
7372		if (rsc_more && rxr->rsc.is_last)
7373			goto drop;
7374	} else {
7375		rxr->rsc.cnt = 0;
7376	}
7377
7378	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7379		goto drop;
7380
7381	/* Store data in per rx ring structure */
7382	hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
7383	    data_len, &info);
7384
7385	if (rsc_more)
7386		return;
7387
7388	hn_rxpkt(rxr);
7389	rxr->rsc.cnt = 0;
7390	return;
7391drop:
7392	rxr->hn_rsc_drop++;
7393	return;
7394}
7395
7396static __inline void
7397hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7398{
7399	const struct rndis_msghdr *hdr;
7400
7401	if (__predict_false(dlen < sizeof(*hdr))) {
7402		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7403		return;
7404	}
7405	hdr = data;
7406
7407	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7408		/* Hot data path. */
7409		hn_rndis_rx_data(rxr, data, dlen);
7410		/* Done! */
7411		return;
7412	}
7413
7414	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7415		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7416	else
7417		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7418}
7419
7420static void
7421hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7422{
7423	const struct hn_nvs_hdr *hdr;
7424
7425	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7426		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7427		return;
7428	}
7429	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7430
7431	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7432		/* Useless; ignore */
7433		return;
7434	}
7435	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7436}
7437
7438static void
7439hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7440    const struct vmbus_chanpkt_hdr *pkt)
7441{
7442	struct hn_nvs_sendctx *sndc;
7443
7444	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7445	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7446	    VMBUS_CHANPKT_DATALEN(pkt));
7447	/*
7448	 * NOTE:
7449	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7450	 * its callback.
7451	 */
7452}
7453
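/*
 * Process one RXBUF channel packet: each range in the packet describes
 * one RNDIS message carried in the shared RX buffer.  Once all ranges
 * have been handled, the RXBUF is acked back to the hypervisor so that
 * it can be recycled.
 */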
7454static void
7455hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7456    const struct vmbus_chanpkt_hdr *pkthdr)
7457{
7458	const struct vmbus_chanpkt_rxbuf *pkt;
7459	const struct hn_nvs_hdr *nvs_hdr;
7460	int count, i, hlen;
7461
7462	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7463		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7464		return;
7465	}
7466	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7467
7468	/* Make sure that this is a RNDIS message. */
7469	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7470		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7471		    nvs_hdr->nvs_type);
7472		return;
7473	}
7474
7475	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7476	if (__predict_false(hlen < sizeof(*pkt))) {
7477		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7478		return;
7479	}
7480	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7481
7482	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7483		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7484		    pkt->cp_rxbuf_id);
7485		return;
7486	}
7487
7488	count = pkt->cp_rxbuf_cnt;
7489	if (__predict_false(hlen <
7490	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7491		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7492		return;
7493	}
7494
7495	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7496	for (i = 0; i < count; ++i) {
7497		int ofs, len;
7498
7499		ofs = pkt->cp_rxbuf[i].rb_ofs;
7500		len = pkt->cp_rxbuf[i].rb_len;
7501		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7502			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7503			    "ofs %d, len %d\n", i, ofs, len);
7504			continue;
7505		}
7506
7507		rxr->rsc.is_last = (i == (count - 1));
7508		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7509	}
7510
7511	/*
7512	 * Ack the consumed RXBUF associated w/ this channel packet,
7513	 * so that this RXBUF can be recycled by the hypervisor.
7514	 */
7515	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7516}
7517
7518static void
7519hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7520    uint64_t tid)
7521{
7522	struct hn_nvs_rndis_ack ack;
7523	int retries, error;
7524
7525	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7526	ack.nvs_status = HN_NVS_STATUS_OK;
7527
7528	retries = 0;
7529again:
7530	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7531	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7532	if (__predict_false(error == EAGAIN)) {
7533		/*
7534		 * NOTE:
7535		 * This should _not_ happen in real world, since the
7536		 * consumption of the TX bufring from the TX path is
7537		 * controlled.
7538		 */
7539		if (rxr->hn_ack_failed == 0)
7540			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7541		rxr->hn_ack_failed++;
7542		retries++;
7543		if (retries < 10) {
7544			DELAY(100);
7545			goto again;
7546		}
7547		/* RXBUF leaks! */
7548		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7549	}
7550}
7551
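/*
 * VMBus channel interrupt callback: drain all pending channel packets,
 * growing the channel packet buffer as needed, and dispatch them as send
 * completions, RXBUF packets or inband notifications.
 */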
7552static void
7553hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7554{
7555	struct hn_rx_ring *rxr = xrxr;
7556	struct hn_softc *sc = rxr->hn_ifp->if_softc;
7557
7558	for (;;) {
7559		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7560		int error, pktlen;
7561
7562		pktlen = rxr->hn_pktbuf_len;
7563		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7564		if (__predict_false(error == ENOBUFS)) {
7565			void *nbuf;
7566			int nlen;
7567
7568			/*
7569			 * Expand channel packet buffer.
7570			 *
7571			 * XXX
7572			 * Use M_WAITOK here, since allocation failure
7573			 * is fatal.
7574			 */
7575			nlen = rxr->hn_pktbuf_len * 2;
7576			while (nlen < pktlen)
7577				nlen *= 2;
7578			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7579
7580			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7581			    rxr->hn_pktbuf_len, nlen);
7582
7583			free(rxr->hn_pktbuf, M_DEVBUF);
7584			rxr->hn_pktbuf = nbuf;
7585			rxr->hn_pktbuf_len = nlen;
7586			/* Retry! */
7587			continue;
7588		} else if (__predict_false(error == EAGAIN)) {
7589			/* No more channel packets; done! */
7590			break;
7591		}
7592		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7593
7594		switch (pkt->cph_type) {
7595		case VMBUS_CHANPKT_TYPE_COMP:
7596			hn_nvs_handle_comp(sc, chan, pkt);
7597			break;
7598
7599		case VMBUS_CHANPKT_TYPE_RXBUF:
7600			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7601			break;
7602
7603		case VMBUS_CHANPKT_TYPE_INBAND:
7604			hn_nvs_handle_notify(sc, pkt);
7605			break;
7606
7607		default:
7608			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7609			    pkt->cph_type);
7610			break;
7611		}
7612	}
7613	hn_chan_rollup(rxr, rxr->hn_txr);
7614}
7615
7616static void
7617hn_sysinit(void *arg __unused)
7618{
7619	int i;
7620
7621	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7622
7623#ifdef HN_IFSTART_SUPPORT
7624	/*
7625	 * Don't use ifnet.if_start if transparent VF mode is requested;
7626	 * mainly due to the IFF_DRV_OACTIVE flag.
7627	 */
7628	if (hn_xpnt_vf && hn_use_if_start) {
7629		hn_use_if_start = 0;
7630		printf("hn: transparent VF mode, if_transmit will be used, "
7631		    "instead of if_start\n");
7632	}
7633#endif
7634	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7635		printf("hn: invalid transparent VF attach routine "
7636		    "wait timeout %d, reset to %d\n",
7637		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7638		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7639	}
7640
7641	/*
7642	 * Initialize VF map.
7643	 */
7644	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7645	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7646	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7647	    M_WAITOK | M_ZERO);
7648
7649	/*
7650	 * Fix the # of TX taskqueues.
7651	 */
7652	if (hn_tx_taskq_cnt <= 0)
7653		hn_tx_taskq_cnt = 1;
7654	else if (hn_tx_taskq_cnt > mp_ncpus)
7655		hn_tx_taskq_cnt = mp_ncpus;
7656
7657	/*
7658	 * Fix the TX taskqueue mode.
7659	 */
7660	switch (hn_tx_taskq_mode) {
7661	case HN_TX_TASKQ_M_INDEP:
7662	case HN_TX_TASKQ_M_GLOBAL:
7663	case HN_TX_TASKQ_M_EVTTQ:
7664		break;
7665	default:
7666		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7667		break;
7668	}
7669
7670	if (vm_guest != VM_GUEST_HV)
7671		return;
7672
7673	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7674		return;
7675
7676	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7677	    M_DEVBUF, M_WAITOK);
7678	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7679		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7680		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7681		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7682		    "hn tx%d", i);
7683	}
7684}
7685SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7686
7687static void
7688hn_sysuninit(void *arg __unused)
7689{
7690
7691	if (hn_tx_taskque != NULL) {
7692		int i;
7693
7694		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7695			taskqueue_free(hn_tx_taskque[i]);
7696		free(hn_tx_taskque, M_DEVBUF);
7697	}
7698
7699	if (hn_vfmap != NULL)
7700		free(hn_vfmap, M_DEVBUF);
7701	rm_destroy(&hn_vfmap_lock);
7702
7703	counter_u64_free(hn_udpcs_fixup);
7704}
7705SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7706