1/*	$NetBSD: if_hvn.c,v 1.27 2024/02/09 22:08:34 andvar Exp $	*/
2/*	$OpenBSD: if_hvn.c,v 1.39 2018/03/11 14:31:34 mikeb Exp $	*/
3
4/*-
5 * Copyright (c) 2009-2012,2016 Microsoft Corp.
6 * Copyright (c) 2010-2012 Citrix Inc.
7 * Copyright (c) 2012 NetApp Inc.
8 * Copyright (c) 2016 Mike Belopuhov <mike@esdenera.com>
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice unmodified, this list of conditions, and the following
16 *    disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * The OpenBSD port was done under funding by Esdenera Networks GmbH.
35 */
36
37#include <sys/cdefs.h>
38__KERNEL_RCSID(0, "$NetBSD: if_hvn.c,v 1.27 2024/02/09 22:08:34 andvar Exp $");
39
40#ifdef _KERNEL_OPT
41#include "opt_if_hvn.h"
42#include "opt_inet.h"
43#include "opt_inet6.h"
44#endif
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/kernel.h>
49#include <sys/device.h>
50#include <sys/bitops.h>
51#include <sys/bus.h>
52#include <sys/condvar.h>
53#include <sys/cpu.h>
54#include <sys/evcnt.h>
55#include <sys/intr.h>
56#include <sys/kmem.h>
57#include <sys/kthread.h>
58#include <sys/mutex.h>
59#include <sys/pcq.h>
60#include <sys/sysctl.h>
61#include <sys/workqueue.h>
62
63#include <net/if.h>
64#include <net/if_ether.h>
65#include <net/if_media.h>
66#include <net/if_vlanvar.h>
67#include <net/rss_config.h>
68#include <netinet/in.h>
69#include <netinet/ip.h>
70#include <netinet/ip6.h>
71#include <netinet/udp.h>
72
73#include <net/bpf.h>
74
75#include <dev/ic/ndisreg.h>
76#include <dev/ic/rndisreg.h>
77
78#include <dev/hyperv/vmbusvar.h>
79#include <dev/hyperv/if_hvnreg.h>
80
81#ifndef EVL_PRIO_BITS
82#define EVL_PRIO_BITS	13
83#endif
84#ifndef EVL_CFI_BITS
85#define EVL_CFI_BITS	12
86#endif
87
88#define HVN_CHIM_SIZE			(15 * 1024 * 1024)
89
90#define HVN_NVS_MSGSIZE			32
91#define HVN_NVS_BUFSIZE			PAGE_SIZE
92
93#define HVN_RING_BUFSIZE		(128 * PAGE_SIZE)
94#define HVN_RING_IDX2CPU(sc, idx)	((idx) % ncpu)
95
96#ifndef HVN_CHANNEL_MAX_COUNT_DEFAULT
97#define HVN_CHANNEL_MAX_COUNT_DEFAULT	8
98#endif
99
100#ifndef HVN_LINK_STATE_CHANGE_DELAY
101#define HVN_LINK_STATE_CHANGE_DELAY	5000
102#endif
103
104#define HVN_WORKQUEUE_PRI		PRI_SOFTNET
105
106/*
107 * RNDIS control interface
108 */
109#define HVN_RNDIS_CTLREQS		4
110#define HVN_RNDIS_BUFSIZE		512
111
112struct rndis_cmd {
113	uint32_t			rc_id;
114	struct hvn_nvs_rndis		rc_msg;
115	void				*rc_req;
116	bus_dmamap_t			rc_dmap;
117	bus_dma_segment_t		rc_segs;
118	int				rc_nsegs;
119	uint64_t			rc_gpa;
120	struct rndis_packet_msg		rc_cmp;
121	uint32_t			rc_cmplen;
122	uint8_t				rc_cmpbuf[HVN_RNDIS_BUFSIZE];
123	int				rc_done;
124	TAILQ_ENTRY(rndis_cmd)		rc_entry;
125	kmutex_t			rc_lock;
126	kcondvar_t			rc_cv;
127};
128TAILQ_HEAD(rndis_queue, rndis_cmd);
129
130#define HVN_MTU_MIN			68
131#define HVN_MTU_MAX			(65535 - ETHER_ADDR_LEN)
132
133#define HVN_RNDIS_XFER_SIZE		2048
134
135#define HVN_NDIS_TXCSUM_CAP_IP4 \
136	(NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT)
137#define HVN_NDIS_TXCSUM_CAP_TCP4 \
138	(NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT)
139#define HVN_NDIS_TXCSUM_CAP_TCP6 \
140	(NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \
141	    NDIS_TXCSUM_CAP_IP6EXT)
142#define HVN_NDIS_TXCSUM_CAP_UDP6 \
143	(NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT)
144#define HVN_NDIS_LSOV2_CAP_IP6 \
145	(NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT)
146
147#define HVN_RNDIS_CMD_NORESP	__BIT(0)
148
149#define HVN_NVS_CMD_NORESP	__BIT(0)
150
151/*
152 * Tx ring
153 */
154#define HVN_TX_DESC			512
155#define HVN_TX_FRAGS			15		/* 31 is the max */
156#define HVN_TX_FRAG_SIZE		PAGE_SIZE
157#define HVN_TX_PKT_SIZE			16384
158
159#define HVN_RNDIS_PKT_LEN					\
160	(sizeof(struct rndis_packet_msg) +			\
161	 sizeof(struct rndis_pktinfo) + NDIS_VLAN_INFO_SIZE +	\
162	 sizeof(struct rndis_pktinfo) + NDIS_TXCSUM_INFO_SIZE)
163
164#define HVN_PKTSIZE_MIN(align)						\
165	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN +	\
166	HVN_RNDIS_PKT_LEN, (align))
167#define HVN_PKTSIZE(m, align)						\
168	roundup2((m)->m_pkthdr.len + HVN_RNDIS_PKT_LEN, (align))
169
170struct hvn_tx_desc {
171	uint32_t			txd_id;
172	struct vmbus_gpa		txd_sgl[HVN_TX_FRAGS + 1];
173	int				txd_nsge;
174	struct mbuf			*txd_buf;
175	bus_dmamap_t			txd_dmap;
176	struct vmbus_gpa		txd_gpa;
177	struct rndis_packet_msg		*txd_req;
178	TAILQ_ENTRY(hvn_tx_desc)	txd_entry;
179	u_int				txd_refs;
180	uint32_t			txd_flags;
181#define HVN_TXD_FLAG_ONAGG		__BIT(0)
182#define HVN_TXD_FLAG_DMAMAP		__BIT(1)
183	uint32_t			txd_chim_index;
184	int				txd_chim_size;
185	STAILQ_ENTRY(hvn_tx_desc)	txd_agg_entry;
186	STAILQ_HEAD(, hvn_tx_desc)	txd_agg_list;
187};
188
189struct hvn_softc;
190struct hvn_rx_ring;
191
192struct hvn_tx_ring {
193	struct hvn_softc		*txr_softc;
194	struct vmbus_channel		*txr_chan;
195	struct hvn_rx_ring		*txr_rxr;
196	void				*txr_si;
197	char				txr_name[16];
198
199	int				txr_id;
200	int				txr_oactive;
201	int				txr_suspended;
202	int				txr_csum_assist;
203	uint64_t			txr_caps_assist;
204	uint32_t			txr_flags;
205#define HVN_TXR_FLAG_UDP_HASH		__BIT(0)
206
207	struct evcnt			txr_evpkts;
208	struct evcnt			txr_evsends;
209	struct evcnt			txr_evnodesc;
210	struct evcnt			txr_evdmafailed;
211	struct evcnt			txr_evdefrag;
212	struct evcnt			txr_evpcqdrop;
213	struct evcnt			txr_evtransmitdefer;
214	struct evcnt			txr_evflushfailed;
215	struct evcnt			txr_evchimneytried;
216	struct evcnt			txr_evchimney;
217	struct evcnt			txr_evvlanfixup;
218	struct evcnt			txr_evvlanhwtagging;
219	struct evcnt			txr_evvlantap;
220
221	kmutex_t			txr_lock;
222	pcq_t				*txr_interq;
223
224	uint32_t			txr_avail;
225	TAILQ_HEAD(, hvn_tx_desc)	txr_list;
226	struct hvn_tx_desc		txr_desc[HVN_TX_DESC];
227	uint8_t				*txr_msgs;
228	struct hyperv_dma		txr_dma;
229
230	int				txr_chim_size;
231
232	/* Applied packet transmission aggregation limits. */
233	int				txr_agg_szmax;
234	short				txr_agg_pktmax;
235	short				txr_agg_align;
236
237	/* Packet transmission aggregation states. */
238	struct hvn_tx_desc		*txr_agg_txd;
239	int				txr_agg_szleft;
240	short				txr_agg_pktleft;
241	struct rndis_packet_msg		*txr_agg_prevpkt;
242
243	/* Temporary stats for each sends. */
244	int				txr_stat_pkts;
245	int				txr_stat_size;
246	int				txr_stat_mcasts;
247
248	int				(*txr_sendpkt)(struct hvn_tx_ring *,
249					    struct hvn_tx_desc *);
250} __aligned(CACHE_LINE_SIZE);
251
252struct hvn_rx_ring {
253	struct hvn_softc		*rxr_softc;
254	struct vmbus_channel		*rxr_chan;
255	struct hvn_tx_ring		*rxr_txr;
256	void				*rxr_si;
257	bool				rxr_workqueue;
258	char				rxr_name[16];
259
260	struct work			rxr_wk;
261	volatile bool			rxr_onlist;
262	volatile bool			rxr_onproc;
263	kmutex_t			rxr_onwork_lock;
264	kcondvar_t			rxr_onwork_cv;
265
266	uint32_t			rxr_flags;
267#define HVN_RXR_FLAG_UDP_HASH		__BIT(0)
268
269	kmutex_t			rxr_lock;
270
271	struct evcnt			rxr_evpkts;
272	struct evcnt			rxr_evcsum_ip;
273	struct evcnt			rxr_evcsum_tcp;
274	struct evcnt			rxr_evcsum_udp;
275	struct evcnt			rxr_evvlanhwtagging;
276	struct evcnt			rxr_evintr;
277	struct evcnt			rxr_evdefer;
278	struct evcnt			rxr_evdeferreq;
279	struct evcnt			rxr_evredeferreq;
280
281	/* NVS */
282	uint8_t				*rxr_nvsbuf;
283} __aligned(CACHE_LINE_SIZE);
284
285struct hvn_softc {
286	device_t			sc_dev;
287
288	struct vmbus_softc		*sc_vmbus;
289	struct vmbus_channel		*sc_prichan;
290	bus_dma_tag_t			sc_dmat;
291
292	struct ethercom			sc_ec;
293	struct ifmedia			sc_media;
294	struct if_percpuq		*sc_ipq;
295	struct workqueue		*sc_wq;
296	bool				sc_txrx_workqueue;
297	kmutex_t			sc_core_lock;
298
299	kmutex_t			sc_link_lock;
300	kcondvar_t			sc_link_cv;
301	callout_t			sc_link_tmout;
302	lwp_t				*sc_link_lwp;
303	uint32_t			sc_link_ev;
304#define HVN_LINK_EV_STATE_CHANGE	__BIT(0)
305#define HVN_LINK_EV_NETWORK_CHANGE_TMOUT __BIT(1)
306#define HVN_LINK_EV_NETWORK_CHANGE	__BIT(2)
307#define HVN_LINK_EV_RESUME_NETWORK	__BIT(3)
308#define HVN_LINK_EV_EXIT_THREAD		__BIT(4)
309	int				sc_link_state;
310	bool				sc_link_onproc;
311	bool				sc_link_pending;
312	bool				sc_link_suspend;
313
314	int				sc_tx_process_limit;
315	int				sc_rx_process_limit;
316	int				sc_tx_intr_process_limit;
317	int				sc_rx_intr_process_limit;
318
319	struct sysctllog		*sc_sysctllog;
320
321	uint32_t			sc_caps;
322#define HVN_CAPS_VLAN			__BIT(0)
323#define HVN_CAPS_MTU			__BIT(1)
324#define HVN_CAPS_IPCS			__BIT(2)
325#define HVN_CAPS_TCP4CS			__BIT(3)
326#define HVN_CAPS_TCP6CS			__BIT(4)
327#define HVN_CAPS_UDP4CS			__BIT(5)
328#define HVN_CAPS_UDP6CS			__BIT(6)
329#define HVN_CAPS_TSO4			__BIT(7)
330#define HVN_CAPS_TSO6			__BIT(8)
331#define HVN_CAPS_HASHVAL		__BIT(9)
332#define HVN_CAPS_UDPHASH		__BIT(10)
333
334	uint32_t			sc_flags;
335#define HVN_SCF_ATTACHED		__BIT(0)
336#define HVN_SCF_RXBUF_CONNECTED		__BIT(1)
337#define HVN_SCF_CHIM_CONNECTED		__BIT(2)
338#define HVN_SCF_REVOKED			__BIT(3)
339#define HVN_SCF_HAS_RSSKEY		__BIT(4)
340#define HVN_SCF_HAS_RSSIND		__BIT(5)
341
342	/* NVS protocol */
343	int				sc_proto;
344	uint32_t			sc_nvstid;
345	uint8_t				sc_nvsrsp[HVN_NVS_MSGSIZE];
346	int				sc_nvsdone;
347	kmutex_t			sc_nvsrsp_lock;
348	kcondvar_t			sc_nvsrsp_cv;
349
350	/* RNDIS protocol */
351	int				sc_ndisver;
352	uint32_t			sc_rndisrid;
353	int				sc_tso_szmax;
354	int				sc_tso_sgmin;
355	uint32_t			sc_rndis_agg_size;
356	uint32_t			sc_rndis_agg_pkts;
357	uint32_t			sc_rndis_agg_align;
358	struct rndis_queue		sc_cntl_sq; /* submission queue */
359	kmutex_t			sc_cntl_sqlck;
360	struct rndis_queue		sc_cntl_cq; /* completion queue */
361	kmutex_t			sc_cntl_cqlck;
362	struct rndis_queue		sc_cntl_fq; /* free queue */
363	kmutex_t			sc_cntl_fqlck;
364	kcondvar_t			sc_cntl_fqcv;
365	struct rndis_cmd		sc_cntl_msgs[HVN_RNDIS_CTLREQS];
366	struct hvn_nvs_rndis		sc_data_msg;
367
368	int				sc_rss_ind_size;
369	uint32_t			sc_rss_hash; /* setting, NDIS_HASH_ */
370	uint32_t			sc_rss_hcap; /* caps, NDIS_HASH_ */
371	struct ndis_rssprm_toeplitz	sc_rss;
372
373	/* Rx ring */
374	uint8_t				*sc_rx_ring;
375	int				sc_rx_size;
376	uint32_t			sc_rx_hndl;
377	struct hyperv_dma		sc_rx_dma;
378	struct hvn_rx_ring		*sc_rxr;
379	int				sc_nrxr;
380	int				sc_nrxr_inuse;
381
382	/* Tx ring */
383	struct hvn_tx_ring		*sc_txr;
384	int				sc_ntxr;
385	int				sc_ntxr_inuse;
386
387	/* chimney sending buffers */
388	uint8_t				*sc_chim;
389	uint32_t			sc_chim_hndl;
390	struct hyperv_dma		sc_chim_dma;
391	kmutex_t			sc_chim_bmap_lock;
392	u_long				*sc_chim_bmap;
393	int				sc_chim_bmap_cnt;
394	int				sc_chim_cnt;
395	int				sc_chim_szmax;
396
397	/* Packet transmission aggregation user settings. */
398	int				sc_agg_size;
399	int				sc_agg_pkts;
400};
401
402#define SC2IFP(_sc_)	(&(_sc_)->sc_ec.ec_if)
403#define IFP2SC(_ifp_)	((_ifp_)->if_softc)
404
405#ifndef HVN_TX_PROCESS_LIMIT_DEFAULT
406#define HVN_TX_PROCESS_LIMIT_DEFAULT		128
407#endif
408#ifndef HVN_RX_PROCESS_LIMIT_DEFAULT
409#define HVN_RX_PROCESS_LIMIT_DEFAULT		128
410#endif
411#ifndef HVN_TX_INTR_PROCESS_LIMIT_DEFAULT
412#define HVN_TX_INTR_PROCESS_LIMIT_DEFAULT	256
413#endif
414#ifndef HVN_RX_INTR_PROCESS_LIMIT_DEFAULT
415#define HVN_RX_INTR_PROCESS_LIMIT_DEFAULT	256
416#endif
417
418/*
419 * See hvn_set_hlen().
420 *
421 * This value is for Azure.  For Hyper-V, set this above
422 * 65536 to disable UDP datagram checksum fixup.
423 */
424#ifndef HVN_UDP_CKSUM_FIXUP_MTU_DEFAULT
425#define HVN_UDP_CKSUM_FIXUP_MTU_DEFAULT	1420
426#endif
427static int hvn_udpcs_fixup_mtu = HVN_UDP_CKSUM_FIXUP_MTU_DEFAULT;
428
429/* Limit chimney send size */
430static int hvn_tx_chimney_size = 0;
431
432/* # of channels to use; each channel has one RX ring and one TX ring */
433#ifndef HVN_CHANNEL_COUNT_DEFAULT
434#define HVN_CHANNEL_COUNT_DEFAULT	0
435#endif
436static int hvn_channel_cnt = HVN_CHANNEL_COUNT_DEFAULT;
437
438/* # of transmit rings to use */
439#ifndef HVN_TX_RING_COUNT_DEFAULT
440#define HVN_TX_RING_COUNT_DEFAULT	0
441#endif
442static int hvn_tx_ring_cnt = HVN_TX_RING_COUNT_DEFAULT;
443
444/* Packet transmission aggregation size limit */
445static int hvn_tx_agg_size = -1;
446
447/* Packet transmission aggregation count limit */
448static int hvn_tx_agg_pkts = -1;
449
450static int	hvn_match(device_t, cfdata_t, void *);
451static void	hvn_attach(device_t, device_t, void *);
452static int	hvn_detach(device_t, int);
453
454CFATTACH_DECL_NEW(hvn, sizeof(struct hvn_softc),
455    hvn_match, hvn_attach, hvn_detach, NULL);
456
457static int	hvn_ioctl(struct ifnet *, u_long, void *);
458static int	hvn_media_change(struct ifnet *);
459static void	hvn_media_status(struct ifnet *, struct ifmediareq *);
460static void	hvn_link_task(void *);
461static void	hvn_link_event(struct hvn_softc *, uint32_t);
462static void	hvn_link_netchg_tmout_cb(void *);
463static int	hvn_init(struct ifnet *);
464static int	hvn_init_locked(struct ifnet *);
465static void	hvn_stop(struct ifnet *, int);
466static void	hvn_stop_locked(struct ifnet *);
467static void	hvn_start(struct ifnet *);
468static int	hvn_transmit(struct ifnet *, struct mbuf *);
469static void	hvn_deferred_transmit(void *);
470static int	hvn_flush_txagg(struct hvn_tx_ring *);
471static int	hvn_encap(struct hvn_tx_ring *, struct hvn_tx_desc *,
472		    struct mbuf *, int);
473static int	hvn_txpkt(struct hvn_tx_ring *, struct hvn_tx_desc *);
474static void	hvn_txeof(struct hvn_tx_ring *, uint64_t);
475static int	hvn_rx_ring_create(struct hvn_softc *, int);
476static int	hvn_rx_ring_destroy(struct hvn_softc *);
477static void	hvn_fixup_rx_data(struct hvn_softc *);
478static int	hvn_tx_ring_create(struct hvn_softc *, int);
479static void	hvn_tx_ring_destroy(struct hvn_softc *);
480static void	hvn_set_chim_size(struct hvn_softc *, int);
481static uint32_t	hvn_chim_alloc(struct hvn_softc *);
482static void	hvn_chim_free(struct hvn_softc *, uint32_t);
483static void	hvn_fixup_tx_data(struct hvn_softc *);
484static struct mbuf *
485		hvn_set_hlen(struct mbuf *, int *);
486static int	hvn_txd_peek(struct hvn_tx_ring *);
487static struct hvn_tx_desc *
488		hvn_txd_get(struct hvn_tx_ring *);
489static void	hvn_txd_put(struct hvn_tx_ring *, struct hvn_tx_desc *);
490static void	hvn_txd_gc(struct hvn_tx_ring *, struct hvn_tx_desc *);
491static void	hvn_txd_hold(struct hvn_tx_desc *);
492static void	hvn_txd_agg(struct hvn_tx_desc *, struct hvn_tx_desc *);
493static int	hvn_tx_ring_pending(struct hvn_tx_ring *);
494static void	hvn_tx_ring_qflush(struct hvn_softc *, struct hvn_tx_ring *);
495static int	hvn_get_rsscaps(struct hvn_softc *, int *);
496static int	hvn_set_rss(struct hvn_softc *, uint16_t);
497static void	hvn_fixup_rss_ind(struct hvn_softc *);
498static int	hvn_get_hwcaps(struct hvn_softc *, struct ndis_offload *);
499static int	hvn_set_capabilities(struct hvn_softc *, int);
500static int	hvn_get_lladdr(struct hvn_softc *, uint8_t *);
501static void	hvn_update_link_status(struct hvn_softc *);
502static int	hvn_get_mtu(struct hvn_softc *, uint32_t *);
503static int	hvn_channel_attach(struct hvn_softc *, struct vmbus_channel *);
504static void	hvn_channel_detach(struct hvn_softc *, struct vmbus_channel *);
505static void	hvn_channel_detach_all(struct hvn_softc *);
506static int	hvn_subchannel_attach(struct hvn_softc *);
507static int	hvn_synth_alloc_subchannels(struct hvn_softc *, int *);
508static int	hvn_synth_attachable(const struct hvn_softc *);
509static int	hvn_synth_attach(struct hvn_softc *, int);
510static void	hvn_synth_detach(struct hvn_softc *);
511static void	hvn_set_ring_inuse(struct hvn_softc *, int);
512static void	hvn_disable_rx(struct hvn_softc *);
513static void	hvn_drain_rxtx(struct hvn_softc *, int );
514static void	hvn_suspend_data(struct hvn_softc *);
515static void	hvn_suspend_mgmt(struct hvn_softc *);
516static void	hvn_suspend(struct hvn_softc *) __unused;
517static void	hvn_resume_tx(struct hvn_softc *, int);
518static void	hvn_resume_data(struct hvn_softc *);
519static void	hvn_resume_mgmt(struct hvn_softc *);
520static void	hvn_resume(struct hvn_softc *) __unused;
521static void	hvn_init_sysctls(struct hvn_softc *);
522
523/* NSVP */
524static int	hvn_nvs_init(struct hvn_softc *);
525static void	hvn_nvs_destroy(struct hvn_softc *);
526static int	hvn_nvs_attach(struct hvn_softc *, int);
527static int	hvn_nvs_connect_rxbuf(struct hvn_softc *);
528static int	hvn_nvs_disconnect_rxbuf(struct hvn_softc *);
529static int	hvn_nvs_connect_chim(struct hvn_softc *);
530static int	hvn_nvs_disconnect_chim(struct hvn_softc *);
531static void	hvn_handle_ring_work(struct work *, void *);
532static void	hvn_nvs_softintr(void *);
533static void	hvn_nvs_intr(void *);
534static void	hvn_nvs_intr1(struct hvn_rx_ring *, int, int);
535static int	hvn_nvs_cmd(struct hvn_softc *, void *, size_t, uint64_t,
536		    u_int);
537static int	hvn_nvs_ack(struct hvn_rx_ring *, uint64_t);
538static void	hvn_nvs_detach(struct hvn_softc *);
539static int	hvn_nvs_alloc_subchannels(struct hvn_softc *, int *);
540
541/* RNDIS */
542static int	hvn_rndis_init(struct hvn_softc *);
543static void	hvn_rndis_destroy(struct hvn_softc *);
544static int	hvn_rndis_attach(struct hvn_softc *, int);
545static int	hvn_rndis_cmd(struct hvn_softc *, struct rndis_cmd *, u_int);
546static int	hvn_rndis_input(struct hvn_rx_ring *, uint64_t, void *);
547static int	hvn_rxeof(struct hvn_rx_ring *, uint8_t *, uint32_t);
548static void	hvn_rndis_complete(struct hvn_softc *, uint8_t *, uint32_t);
549static int	hvn_rndis_output_sgl(struct hvn_tx_ring *,
550		    struct hvn_tx_desc *);
551static int	hvn_rndis_output_chim(struct hvn_tx_ring *,
552		    struct hvn_tx_desc *);
553static void	hvn_rndis_status(struct hvn_softc *, uint8_t *, uint32_t);
554static int	hvn_rndis_query(struct hvn_softc *, uint32_t, void *, size_t *);
555static int	hvn_rndis_query2(struct hvn_softc *, uint32_t, const void *,
556		    size_t, void *, size_t *, size_t);
557static int	hvn_rndis_set(struct hvn_softc *, uint32_t, void *, size_t);
558static int	hvn_rndis_open(struct hvn_softc *);
559static int	hvn_rndis_close(struct hvn_softc *);
560static void	hvn_rndis_detach(struct hvn_softc *);
561
562static int
563hvn_match(device_t parent, cfdata_t match, void *aux)
564{
565	struct vmbus_attach_args *aa = aux;
566
567	if (memcmp(aa->aa_type, &hyperv_guid_network, sizeof(*aa->aa_type)))
568		return 0;
569	return 1;
570}
571
572static void
573hvn_attach(device_t parent, device_t self, void *aux)
574{
575	struct hvn_softc *sc = device_private(self);
576	struct vmbus_attach_args *aa = aux;
577	struct ifnet *ifp = SC2IFP(sc);
578	char xnamebuf[32];
579	uint8_t enaddr[ETHER_ADDR_LEN];
580	uint32_t mtu;
581	int tx_ring_cnt, ring_cnt;
582	int error;
583
584	sc->sc_dev = self;
585	sc->sc_vmbus = (struct vmbus_softc *)device_private(parent);
586	sc->sc_prichan = aa->aa_chan;
587	sc->sc_dmat = sc->sc_vmbus->sc_dmat;
588
589	aprint_naive("\n");
590	aprint_normal(": Hyper-V NetVSC\n");
591
592	sc->sc_txrx_workqueue = true;
593	sc->sc_tx_process_limit = HVN_TX_PROCESS_LIMIT_DEFAULT;
594	sc->sc_rx_process_limit = HVN_RX_PROCESS_LIMIT_DEFAULT;
595	sc->sc_tx_intr_process_limit = HVN_TX_INTR_PROCESS_LIMIT_DEFAULT;
596	sc->sc_rx_intr_process_limit = HVN_RX_INTR_PROCESS_LIMIT_DEFAULT;
597	sc->sc_agg_size = hvn_tx_agg_size;
598	sc->sc_agg_pkts = hvn_tx_agg_pkts;
599
600	mutex_init(&sc->sc_core_lock, MUTEX_DEFAULT, IPL_SOFTNET);
601	mutex_init(&sc->sc_link_lock, MUTEX_DEFAULT, IPL_NET);
602	cv_init(&sc->sc_link_cv, "hvnknkcv");
603	callout_init(&sc->sc_link_tmout, CALLOUT_MPSAFE);
604	callout_setfunc(&sc->sc_link_tmout, hvn_link_netchg_tmout_cb, sc);
605	if (kthread_create(PRI_NONE, KTHREAD_MUSTJOIN | KTHREAD_MPSAFE, NULL,
606	    hvn_link_task, sc, &sc->sc_link_lwp, "%slink",
607	    device_xname(self))) {
608		aprint_error_dev(self, "failed to create link thread\n");
609		return;
610	}
611
612	snprintf(xnamebuf, sizeof(xnamebuf), "%srxtx", device_xname(self));
613	if (workqueue_create(&sc->sc_wq, xnamebuf, hvn_handle_ring_work,
614	    sc, HVN_WORKQUEUE_PRI, IPL_NET, WQ_PERCPU | WQ_MPSAFE)) {
615		aprint_error_dev(self, "failed to create workqueue\n");
616		sc->sc_wq = NULL;
617		goto destroy_link_thread;
618	}
619
620	ring_cnt = hvn_channel_cnt;
621	if (ring_cnt <= 0) {
622		ring_cnt = ncpu;
623		if (ring_cnt > HVN_CHANNEL_MAX_COUNT_DEFAULT)
624			ring_cnt = HVN_CHANNEL_MAX_COUNT_DEFAULT;
625	} else if (ring_cnt > ncpu)
626		ring_cnt = ncpu;
627
628	tx_ring_cnt = hvn_tx_ring_cnt;
629	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
630		tx_ring_cnt = ring_cnt;
631
632	if (hvn_tx_ring_create(sc, tx_ring_cnt)) {
633		aprint_error_dev(self, "failed to create Tx ring\n");
634		goto destroy_wq;
635	}
636
637	if (hvn_rx_ring_create(sc, ring_cnt)) {
638		aprint_error_dev(self, "failed to create Rx ring\n");
639		goto destroy_tx_ring;
640	}
641
642	strlcpy(ifp->if_xname, device_xname(sc->sc_dev), IFNAMSIZ);
643	ifp->if_softc = sc;
644	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
645	ifp->if_extflags = IFEF_MPSAFE;
646	ifp->if_ioctl = hvn_ioctl;
647	ifp->if_start = hvn_start;
648	ifp->if_transmit = hvn_transmit;
649	ifp->if_init = hvn_init;
650	ifp->if_stop = hvn_stop;
651	ifp->if_baudrate = IF_Gbps(10);
652
653	IFQ_SET_MAXLEN(&ifp->if_snd, uimax(HVN_TX_DESC - 1, IFQ_MAXLEN));
654	IFQ_SET_READY(&ifp->if_snd);
655
656	/* Initialize ifmedia structures. */
657	sc->sc_ec.ec_ifmedia = &sc->sc_media;
658	ifmedia_init_with_lock(&sc->sc_media, IFM_IMASK,
659	    hvn_media_change, hvn_media_status, &sc->sc_core_lock);
660	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
661	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_T | IFM_FDX, 0, NULL);
662	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_T, 0, NULL);
663	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
664
665	if_initialize(ifp);
666	sc->sc_ipq = if_percpuq_create(ifp);
667	if_deferred_start_init(ifp, NULL);
668
669	hvn_nvs_init(sc);
670	hvn_rndis_init(sc);
671	if (hvn_synth_attach(sc, ETHERMTU)) {
672		aprint_error_dev(self, "failed to attach synth\n");
673		goto destroy_if_percpuq;
674	}
675
676	aprint_normal_dev(self, "NVS %d.%d NDIS %d.%d\n",
677	    sc->sc_proto >> 16, sc->sc_proto & 0xffff,
678	    sc->sc_ndisver >> 16 , sc->sc_ndisver & 0xffff);
679
680	if (hvn_get_lladdr(sc, enaddr)) {
681		aprint_error_dev(self,
682		    "failed to obtain an ethernet address\n");
683		goto detach_synth;
684	}
685	aprint_normal_dev(self, "Ethernet address %s\n", ether_sprintf(enaddr));
686
687	/*
688	 * Fixup TX/RX stuffs after synthetic parts are attached.
689	 */
690	hvn_fixup_tx_data(sc);
691	hvn_fixup_rx_data(sc);
692
693	ifp->if_capabilities |= sc->sc_txr[0].txr_caps_assist &
694		(IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
695		 IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
696		 IFCAP_CSUM_TCPv6_Tx | IFCAP_CSUM_TCPv6_Rx |
697		 IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
698		 IFCAP_CSUM_UDPv6_Tx | IFCAP_CSUM_UDPv6_Rx);
699	/* XXX TSOv4, TSOv6 */
700	if (sc->sc_caps & HVN_CAPS_VLAN) {
701		/* XXX not sure about VLAN_MTU. */
702		sc->sc_ec.ec_capabilities |= ETHERCAP_VLAN_HWTAGGING;
703		sc->sc_ec.ec_capabilities |= ETHERCAP_VLAN_MTU;
704	}
705	sc->sc_ec.ec_capabilities |= ETHERCAP_JUMBO_MTU;
706
707	ether_ifattach(ifp, enaddr);
708
709	error = hvn_get_mtu(sc, &mtu);
710	if (error)
711		mtu = ETHERMTU;
712	if (mtu < ETHERMTU) {
713		DPRINTF("%s: fixup mtu %u -> %u\n", device_xname(sc->sc_dev),
714		    ETHERMTU, mtu);
715		ifp->if_mtu = mtu;
716	}
717
718	if_register(ifp);
719
720	/*
721	 * Kick off link status check.
722	 */
723	hvn_link_event(sc, HVN_LINK_EV_STATE_CHANGE);
724
725	hvn_init_sysctls(sc);
726
727	if (pmf_device_register(self, NULL, NULL))
728		pmf_class_network_register(self, ifp);
729	else
730		aprint_error_dev(self, "couldn't establish power handler\n");
731
732	SET(sc->sc_flags, HVN_SCF_ATTACHED);
733	return;
734
735detach_synth:
736	hvn_synth_detach(sc);
737	hvn_rndis_destroy(sc);
738	hvn_nvs_destroy(sc);
739destroy_if_percpuq:
740	if_percpuq_destroy(sc->sc_ipq);
741	hvn_rx_ring_destroy(sc);
742destroy_tx_ring:
743	hvn_tx_ring_destroy(sc);
744destroy_wq:
745	workqueue_destroy(sc->sc_wq);
746	sc->sc_wq = NULL;
747destroy_link_thread:
748	hvn_link_event(sc, HVN_LINK_EV_EXIT_THREAD);
749	kthread_join(sc->sc_link_lwp);
750	callout_destroy(&sc->sc_link_tmout);
751	cv_destroy(&sc->sc_link_cv);
752	mutex_destroy(&sc->sc_link_lock);
753	mutex_destroy(&sc->sc_core_lock);
754}
755
756static int
757hvn_detach(device_t self, int flags)
758{
759	struct hvn_softc *sc = device_private(self);
760	struct ifnet *ifp = SC2IFP(sc);
761
762	if (!ISSET(sc->sc_flags, HVN_SCF_ATTACHED))
763		return 0;
764
765	if (vmbus_channel_is_revoked(sc->sc_prichan))
766		SET(sc->sc_flags, HVN_SCF_REVOKED);
767
768	pmf_device_deregister(self);
769
770	mutex_enter(&sc->sc_core_lock);
771
772	if (ifp->if_flags & IFF_RUNNING)
773		hvn_stop_locked(ifp);
774	/*
775	 * NOTE:
776	 * hvn_stop() only suspends data, so management
777	 * stuffs have to be suspended manually here.
778	 */
779	hvn_suspend_mgmt(sc);
780
781	ether_ifdetach(ifp);
782	if_detach(ifp);
783	if_percpuq_destroy(sc->sc_ipq);
784
785	hvn_link_event(sc, HVN_LINK_EV_EXIT_THREAD);
786	kthread_join(sc->sc_link_lwp);
787	callout_halt(&sc->sc_link_tmout, NULL);
788
789	hvn_synth_detach(sc);
790	hvn_rndis_destroy(sc);
791	hvn_nvs_destroy(sc);
792
793	mutex_exit(&sc->sc_core_lock);
794
795	hvn_rx_ring_destroy(sc);
796	hvn_tx_ring_destroy(sc);
797
798	workqueue_destroy(sc->sc_wq);
799	callout_destroy(&sc->sc_link_tmout);
800	cv_destroy(&sc->sc_link_cv);
801	mutex_destroy(&sc->sc_link_lock);
802	mutex_destroy(&sc->sc_core_lock);
803
804	sysctl_teardown(&sc->sc_sysctllog);
805
806	return 0;
807}
808
809static int
810hvn_ioctl(struct ifnet *ifp, u_long command, void * data)
811{
812	struct hvn_softc *sc = IFP2SC(ifp);
813	struct ifreq *ifr = (struct ifreq *)data;
814	uint32_t mtu;
815	int s, error = 0;
816
817	switch (command) {
818	case SIOCSIFMTU:
819		if (ifr->ifr_mtu < HVN_MTU_MIN || ifr->ifr_mtu > HVN_MTU_MAX) {
820			error = EINVAL;
821			break;
822		}
823
824		mutex_enter(&sc->sc_core_lock);
825
826		if (!(sc->sc_caps & HVN_CAPS_MTU)) {
827			/* Can't change MTU */
828			mutex_exit(&sc->sc_core_lock);
829			error = EOPNOTSUPP;
830			break;
831		}
832
833		if (ifp->if_mtu == ifr->ifr_mtu) {
834			mutex_exit(&sc->sc_core_lock);
835			break;
836		}
837
838		/*
839		 * Suspend this interface before the synthetic parts
840		 * are ripped.
841		 */
842		hvn_suspend(sc);
843
844		/*
845		 * Detach the synthetics parts, i.e. NVS and RNDIS.
846		 */
847		hvn_synth_detach(sc);
848
849		/*
850		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
851		 * with the new MTU setting.
852		 */
853		error = hvn_synth_attach(sc, ifr->ifr_mtu);
854		if (error) {
855			mutex_exit(&sc->sc_core_lock);
856			break;
857		}
858
859		error = hvn_get_mtu(sc, &mtu);
860		if (error)
861			mtu = ifr->ifr_mtu;
862		DPRINTF("%s: RNDIS mtu=%d\n", device_xname(sc->sc_dev), mtu);
863
864		/*
865		 * Commit the requested MTU, after the synthetic parts
866		 * have been successfully attached.
867		 */
868		if (mtu >= ifr->ifr_mtu) {
869			mtu = ifr->ifr_mtu;
870		} else {
871			DPRINTF("%s: fixup mtu %d -> %u\n",
872			    device_xname(sc->sc_dev), ifr->ifr_mtu, mtu);
873		}
874		ifp->if_mtu = mtu;
875
876		/*
877		 * Synthetic parts' reattach may change the chimney
878		 * sending size; update it.
879		 */
880		if (sc->sc_txr[0].txr_chim_size > sc->sc_chim_szmax)
881			hvn_set_chim_size(sc, sc->sc_chim_szmax);
882
883		/*
884		 * All done!  Resume the interface now.
885		 */
886		hvn_resume(sc);
887
888		mutex_exit(&sc->sc_core_lock);
889		break;
890	default:
891		s = splnet();
892		if (command == SIOCGIFMEDIA || command == SIOCSIFMEDIA)
893			error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, command);
894		else
895			error = ether_ioctl(ifp, command, data);
896		splx(s);
897		if (error == ENETRESET) {
898			mutex_enter(&sc->sc_core_lock);
899			if (ifp->if_flags & IFF_RUNNING)
900				hvn_init_locked(ifp);
901			mutex_exit(&sc->sc_core_lock);
902			error = 0;
903		}
904		break;
905	}
906
907	return error;
908}
909
910static int
911hvn_media_change(struct ifnet *ifp)
912{
913	struct hvn_softc *sc = IFP2SC(ifp);
914	struct ifmedia *ifm = &sc->sc_media;
915
916	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
917		return EINVAL;
918
919	switch (IFM_SUBTYPE(ifm->ifm_media)) {
920	case IFM_AUTO:
921		break;
922	default:
923		device_printf(sc->sc_dev, "Only auto media type\n");
924		return EINVAL;
925	}
926	return 0;
927}
928
929static void
930hvn_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
931{
932	struct hvn_softc *sc = IFP2SC(ifp);
933
934	ifmr->ifm_status = IFM_AVALID;
935	ifmr->ifm_active = IFM_ETHER;
936
937	if (sc->sc_link_state != LINK_STATE_UP) {
938		ifmr->ifm_active |= IFM_NONE;
939		return;
940	}
941
942	ifmr->ifm_status |= IFM_ACTIVE;
943	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
944}
945
946static void
947hvn_link_task(void *arg)
948{
949	struct hvn_softc *sc = arg;
950	struct ifnet *ifp = SC2IFP(sc);
951	uint32_t event;
952	int old_link_state;
953
954	mutex_enter(&sc->sc_link_lock);
955	sc->sc_link_onproc = false;
956	for (;;) {
957		if (sc->sc_link_ev == 0) {
958			cv_wait(&sc->sc_link_cv, &sc->sc_link_lock);
959			continue;
960		}
961
962		sc->sc_link_onproc = true;
963		event = sc->sc_link_ev;
964		sc->sc_link_ev = 0;
965		mutex_exit(&sc->sc_link_lock);
966
967		if (event & HVN_LINK_EV_EXIT_THREAD)
968			break;
969
970		if (sc->sc_link_suspend)
971			goto next;
972
973		if (event & HVN_LINK_EV_RESUME_NETWORK) {
974			if (sc->sc_link_pending)
975				event |= HVN_LINK_EV_NETWORK_CHANGE;
976			else
977				event |= HVN_LINK_EV_STATE_CHANGE;
978		}
979
980		if (event & HVN_LINK_EV_NETWORK_CHANGE) {
981			/* Prevent any link status checks from running. */
982			sc->sc_link_pending = true;
983
984			/*
985			 * Fake up a [link down --> link up] state change;
986			 * 5 seconds delay is used, which closely simulates
987			 * miibus reaction upon link down event.
988			 */
989			old_link_state = sc->sc_link_state;
990			sc->sc_link_state = LINK_STATE_DOWN;
991			if (old_link_state != sc->sc_link_state) {
992				if_link_state_change(ifp, LINK_STATE_DOWN);
993			}
994#if defined(HVN_LINK_STATE_CHANGE_DELAY) && HVN_LINK_STATE_CHANGE_DELAY > 0
995			callout_schedule(&sc->sc_link_tmout,
996			    mstohz(HVN_LINK_STATE_CHANGE_DELAY));
997#else
998			hvn_link_event(sc, HVN_LINK_EV_NETWORK_CHANGE_TMOUT);
999#endif
1000		} else if (event & HVN_LINK_EV_NETWORK_CHANGE_TMOUT) {
1001			/* Re-allow link status checks. */
1002			sc->sc_link_pending = false;
1003			hvn_update_link_status(sc);
1004		} else if (event & HVN_LINK_EV_STATE_CHANGE) {
1005			if (!sc->sc_link_pending)
1006				hvn_update_link_status(sc);
1007		}
1008 next:
1009		mutex_enter(&sc->sc_link_lock);
1010		sc->sc_link_onproc = false;
1011	}
1012
1013	mutex_enter(&sc->sc_link_lock);
1014	sc->sc_link_onproc = false;
1015	mutex_exit(&sc->sc_link_lock);
1016
1017	kthread_exit(0);
1018}
1019
1020static void
1021hvn_link_event(struct hvn_softc *sc, uint32_t ev)
1022{
1023
1024	mutex_enter(&sc->sc_link_lock);
1025	SET(sc->sc_link_ev, ev);
1026	cv_signal(&sc->sc_link_cv);
1027	mutex_exit(&sc->sc_link_lock);
1028}
1029
1030static void
1031hvn_link_netchg_tmout_cb(void *arg)
1032{
1033	struct hvn_softc *sc = arg;
1034
1035	hvn_link_event(sc, HVN_LINK_EV_NETWORK_CHANGE_TMOUT);
1036}
1037
1038static int
1039hvn_init(struct ifnet *ifp)
1040{
1041	struct hvn_softc *sc = IFP2SC(ifp);
1042	int error;
1043
1044	mutex_enter(&sc->sc_core_lock);
1045	error = hvn_init_locked(ifp);
1046	mutex_exit(&sc->sc_core_lock);
1047
1048	return error;
1049}
1050
1051static int
1052hvn_init_locked(struct ifnet *ifp)
1053{
1054	struct hvn_softc *sc = IFP2SC(ifp);
1055	int error;
1056
1057	KASSERT(mutex_owned(&sc->sc_core_lock));
1058
1059	hvn_stop_locked(ifp);
1060
1061	error = hvn_rndis_open(sc);
1062	if (error)
1063		return error;
1064
1065	/* Clear TX 'suspended' bit. */
1066	hvn_resume_tx(sc, sc->sc_ntxr_inuse);
1067
1068	/* Everything is ready; unleash! */
1069	ifp->if_flags |= IFF_RUNNING;
1070
1071	return 0;
1072}
1073
1074static void
1075hvn_stop(struct ifnet *ifp, int disable)
1076{
1077	struct hvn_softc *sc = IFP2SC(ifp);
1078
1079	mutex_enter(&sc->sc_core_lock);
1080	hvn_stop_locked(ifp);
1081	mutex_exit(&sc->sc_core_lock);
1082}
1083
1084static void
1085hvn_stop_locked(struct ifnet *ifp)
1086{
1087	struct hvn_softc *sc = IFP2SC(ifp);
1088	int i;
1089
1090	KASSERT(mutex_owned(&sc->sc_core_lock));
1091
1092	/* Clear RUNNING bit ASAP. */
1093	ifp->if_flags &= ~IFF_RUNNING;
1094
1095	/* Suspend data transfers. */
1096	hvn_suspend_data(sc);
1097
1098	/* Clear OACTIVE state. */
1099	for (i = 0; i < sc->sc_ntxr_inuse; i++)
1100		sc->sc_txr[i].txr_oactive = 0;
1101}
1102
1103static void
1104hvn_transmit_common(struct ifnet *ifp, struct hvn_tx_ring *txr,
1105    bool is_transmit)
1106{
1107	struct hvn_tx_desc *txd;
1108	struct mbuf *m;
1109	int l2hlen = ETHER_HDR_LEN;
1110
1111	KASSERT(mutex_owned(&txr->txr_lock));
1112
1113	if (!(ifp->if_flags & IFF_RUNNING))
1114		return;
1115	if (txr->txr_oactive)
1116		return;
1117	if (txr->txr_suspended)
1118		return;
1119
1120	for (;;) {
1121		if (!hvn_txd_peek(txr)) {
1122			/* transient */
1123			txr->txr_oactive = 1;
1124			txr->txr_evnodesc.ev_count++;
1125			break;
1126		}
1127
1128		if (is_transmit)
1129			m = pcq_get(txr->txr_interq);
1130		else
1131			IFQ_DEQUEUE(&ifp->if_snd, m);
1132		if (m == NULL)
1133			break;
1134
1135#if defined(INET) || defined(INET6)
1136		if (m->m_pkthdr.csum_flags &
1137		    (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_TCPv6|M_CSUM_UDPv6)) {
1138			m = hvn_set_hlen(m, &l2hlen);
1139			if (__predict_false(m == NULL)) {
1140				if_statinc(ifp, if_oerrors);
1141				continue;
1142			}
1143		}
1144#endif
1145
1146		txd = hvn_txd_get(txr);
1147		if (hvn_encap(txr, txd, m, l2hlen)) {
1148			/* the chain is too large */
1149			if_statinc(ifp, if_oerrors);
1150			hvn_txd_put(txr, txd);
1151			m_freem(m);
1152			continue;
1153		}
1154
1155		if (txr->txr_agg_pktleft == 0) {
1156			if (txr->txr_agg_txd != NULL) {
1157				hvn_flush_txagg(txr);
1158			} else {
1159				if (hvn_txpkt(txr, txd)) {
1160					/* txd is freed, but m is not. */
1161					m_freem(m);
1162					if_statinc(ifp, if_oerrors);
1163				}
1164			}
1165		}
1166	}
1167
1168	/* Flush pending aggerated transmission. */
1169	if (txr->txr_agg_txd != NULL)
1170		hvn_flush_txagg(txr);
1171}
1172
1173static void
1174hvn_start(struct ifnet *ifp)
1175{
1176	struct hvn_softc *sc = IFP2SC(ifp);
1177	struct hvn_tx_ring *txr = &sc->sc_txr[0];
1178
1179	mutex_enter(&txr->txr_lock);
1180	hvn_transmit_common(ifp, txr, false);
1181	mutex_exit(&txr->txr_lock);
1182}
1183
1184static int
1185hvn_select_txqueue(struct ifnet *ifp, struct mbuf *m __unused)
1186{
1187	struct hvn_softc *sc = IFP2SC(ifp);
1188	u_int cpu;
1189
1190	cpu = cpu_index(curcpu());
1191
1192	return cpu % sc->sc_ntxr_inuse;
1193}
1194
1195static int
1196hvn_transmit(struct ifnet *ifp, struct mbuf *m)
1197{
1198	struct hvn_softc *sc = IFP2SC(ifp);
1199	struct hvn_tx_ring *txr;
1200	int qid;
1201
1202	qid = hvn_select_txqueue(ifp, m);
1203	txr = &sc->sc_txr[qid];
1204
1205	if (__predict_false(!pcq_put(txr->txr_interq, m))) {
1206		mutex_enter(&txr->txr_lock);
1207		txr->txr_evpcqdrop.ev_count++;
1208		mutex_exit(&txr->txr_lock);
1209		m_freem(m);
1210		return ENOBUFS;
1211	}
1212
1213	kpreempt_disable();
1214	softint_schedule(txr->txr_si);
1215	kpreempt_enable();
1216	return 0;
1217}
1218
1219static void
1220hvn_deferred_transmit(void *arg)
1221{
1222	struct hvn_tx_ring *txr = arg;
1223	struct hvn_softc *sc = txr->txr_softc;
1224	struct ifnet *ifp = SC2IFP(sc);
1225
1226	mutex_enter(&txr->txr_lock);
1227	txr->txr_evtransmitdefer.ev_count++;
1228	hvn_transmit_common(ifp, txr, true);
1229	mutex_exit(&txr->txr_lock);
1230}
1231
1232static inline char *
1233hvn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1234    size_t datalen, uint32_t type)
1235{
1236	struct rndis_pktinfo *pi;
1237	size_t pi_size = sizeof(*pi) + datalen;
1238	char *cp;
1239
1240	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <=
1241	    pktsize);
1242
1243	cp = (char *)pkt + pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1244	pi = (struct rndis_pktinfo *)cp;
1245	pi->rm_size = pi_size;
1246	pi->rm_type = type;
1247	pi->rm_pktinfooffset = sizeof(*pi);
1248	pkt->rm_pktinfolen += pi_size;
1249	pkt->rm_dataoffset += pi_size;
1250	pkt->rm_len += pi_size;
1251
1252	return (char *)pi->rm_data;
1253}
1254
1255static struct mbuf *
1256hvn_pullup_hdr(struct mbuf *m, int len)
1257{
1258	struct mbuf *mn;
1259
1260	if (__predict_false(m->m_len < len)) {
1261		mn = m_pullup(m, len);
1262		if (mn == NULL)
1263			return NULL;
1264		m = mn;
1265	}
1266	return m;
1267}
1268
1269/*
1270 * NOTE: If this function failed, the m would be freed.
1271 */
1272static struct mbuf *
1273hvn_set_hlen(struct mbuf *m, int *l2hlenp)
1274{
1275	const struct ether_header *eh;
1276	int l2hlen, off;
1277
1278	m = hvn_pullup_hdr(m, sizeof(*eh));
1279	if (m == NULL)
1280		return NULL;
1281
1282	eh = mtod(m, const struct ether_header *);
1283	if (eh->ether_type == ntohs(ETHERTYPE_VLAN))
1284		l2hlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1285	else
1286		l2hlen = ETHER_HDR_LEN;
1287
1288#if defined(INET)
1289	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4 | M_CSUM_UDPv4)) {
1290		const struct ip *ip;
1291
1292		off = l2hlen + sizeof(*ip);
1293		m = hvn_pullup_hdr(m, off);
1294		if (m == NULL)
1295			return NULL;
1296
1297		ip = (struct ip *)((mtod(m, uint8_t *)) + off);
1298
1299		/*
1300		 * UDP checksum offload does not work in Azure, if the
1301		 * following conditions meet:
1302		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
1303		 * - IP_DF is not set in the IP hdr.
1304		 *
1305		 * Fallback to software checksum for these UDP datagrams.
1306		 */
1307		if ((m->m_pkthdr.csum_flags & M_CSUM_UDPv4) &&
1308		    m->m_pkthdr.len > hvn_udpcs_fixup_mtu + l2hlen &&
1309		    !(ntohs(ip->ip_off) & IP_DF)) {
1310			uint16_t *csump;
1311
1312			off = l2hlen +
1313			    M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data);
1314			m = hvn_pullup_hdr(m, off + sizeof(struct udphdr));
1315			if (m == NULL)
1316				return NULL;
1317
1318			csump = (uint16_t *)(mtod(m, uint8_t *) + off +
1319			    M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data));
1320			*csump = cpu_in_cksum(m, m->m_pkthdr.len - off, off, 0);
1321			m->m_pkthdr.csum_flags &= ~M_CSUM_UDPv4;
1322		}
1323	}
1324#endif	/* INET */
1325#if defined(INET) && defined(INET6)
1326	else
1327#endif	/* INET && INET6 */
1328#if defined(INET6)
1329	{
1330		const struct ip6_hdr *ip6;
1331
1332		off = l2hlen + sizeof(*ip6);
1333		m = hvn_pullup_hdr(m, off);
1334		if (m == NULL)
1335			return NULL;
1336
1337		ip6 = (struct ip6_hdr *)((mtod(m, uint8_t *)) + l2hlen);
1338		if (ip6->ip6_nxt != IPPROTO_TCP &&
1339		    ip6->ip6_nxt != IPPROTO_UDP) {
1340			m_freem(m);
1341			return NULL;
1342		}
1343	}
1344#endif	/* INET6 */
1345
1346	*l2hlenp = l2hlen;
1347
1348	return m;
1349}
1350
1351static int
1352hvn_flush_txagg(struct hvn_tx_ring *txr)
1353{
1354	struct hvn_softc *sc = txr->txr_softc;
1355	struct ifnet *ifp = SC2IFP(sc);
1356	struct hvn_tx_desc *txd;
1357	struct mbuf *m;
1358	int error, pkts;
1359
1360	txd = txr->txr_agg_txd;
1361	KASSERTMSG(txd != NULL, "no aggregate txdesc");
1362
1363	/*
1364	 * Since hvn_txpkt() will reset this temporary stat, save
1365	 * it now, so that oerrors can be updated properly, if
1366	 * hvn_txpkt() ever fails.
1367	 */
1368	pkts = txr->txr_stat_pkts;
1369
1370	/*
1371	 * Since txd's mbuf will _not_ be freed upon hvn_txpkt()
1372	 * failure, save it for later freeing, if hvn_txpkt() ever
1373	 * fails.
1374	 */
1375	m = txd->txd_buf;
1376	error = hvn_txpkt(txr, txd);
1377	if (__predict_false(error)) {
1378		/* txd is freed, but m is not. */
1379		m_freem(m);
1380		txr->txr_evflushfailed.ev_count++;
1381		if_statadd(ifp, if_oerrors, pkts);
1382	}
1383
1384	/* Reset all aggregation states. */
1385	txr->txr_agg_txd = NULL;
1386	txr->txr_agg_szleft = 0;
1387	txr->txr_agg_pktleft = 0;
1388	txr->txr_agg_prevpkt = NULL;
1389
1390	return error;
1391}
1392
1393static void *
1394hvn_try_txagg(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd, int pktsz)
1395{
1396	struct hvn_softc *sc = txr->txr_softc;
1397	struct hvn_tx_desc *agg_txd;
1398	struct rndis_packet_msg *pkt;
1399	void *chim;
1400	int olen;
1401
1402	if (txr->txr_agg_txd != NULL) {
1403		if (txr->txr_agg_pktleft > 0 && txr->txr_agg_szleft > pktsz) {
1404			agg_txd = txr->txr_agg_txd;
1405			pkt = txr->txr_agg_prevpkt;
1406
1407			/*
1408			 * Update the previous RNDIS packet's total length,
1409			 * it can be increased due to the mandatory alignment
1410			 * padding for this RNDIS packet.  And update the
1411			 * aggregating txdesc's chimney sending buffer size
1412			 * accordingly.
1413			 *
1414			 * XXX
1415			 * Zero-out the padding, as required by the RNDIS spec.
1416			 */
1417			olen = pkt->rm_len;
1418			pkt->rm_len = roundup2(olen, txr->txr_agg_align);
1419			agg_txd->txd_chim_size += pkt->rm_len - olen;
1420
1421			/* Link this txdesc to the parent. */
1422			hvn_txd_agg(agg_txd, txd);
1423
1424			chim = (uint8_t *)pkt + pkt->rm_len;
1425			/* Save the current packet for later fixup. */
1426			txr->txr_agg_prevpkt = chim;
1427
1428			txr->txr_agg_pktleft--;
1429			txr->txr_agg_szleft -= pktsz;
1430			if (txr->txr_agg_szleft <=
1431			    HVN_PKTSIZE_MIN(txr->txr_agg_align)) {
1432				/*
1433				 * Probably can't aggregate more packets,
1434				 * flush this aggregating txdesc proactively.
1435				 */
1436				txr->txr_agg_pktleft = 0;
1437			}
1438
1439			/* Done! */
1440			return chim;
1441		}
1442		hvn_flush_txagg(txr);
1443	}
1444
1445	txr->txr_evchimneytried.ev_count++;
1446	txd->txd_chim_index = hvn_chim_alloc(sc);
1447	if (txd->txd_chim_index == HVN_NVS_CHIM_IDX_INVALID)
1448		return NULL;
1449	txr->txr_evchimney.ev_count++;
1450
1451	chim = sc->sc_chim + (txd->txd_chim_index * sc->sc_chim_szmax);
1452
1453	if (txr->txr_agg_pktmax > 1 &&
1454	    txr->txr_agg_szmax > pktsz + HVN_PKTSIZE_MIN(txr->txr_agg_align)) {
1455		txr->txr_agg_txd = txd;
1456		txr->txr_agg_pktleft = txr->txr_agg_pktmax - 1;
1457		txr->txr_agg_szleft = txr->txr_agg_szmax - pktsz;
1458		txr->txr_agg_prevpkt = chim;
1459	}
1460
1461	return chim;
1462}
1463
1464static int
1465hvn_encap(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd, struct mbuf *m,
1466    int l2hlen)
1467{
1468	/* Used to pad ethernet frames with < ETHER_MIN_LEN bytes */
1469	static const char zero_pad[ETHER_MIN_LEN];
1470	struct hvn_softc *sc = txr->txr_softc;
1471	struct rndis_packet_msg *pkt;
1472	bus_dma_segment_t *seg;
1473	void *chim = NULL;
1474	size_t pktlen, pktsize;
1475	int l3hlen;
1476	int i, rv;
1477
1478	if (ISSET(sc->sc_caps, HVN_CAPS_VLAN) && !vlan_has_tag(m)) {
1479		struct ether_vlan_header *evl;
1480
1481		m = hvn_pullup_hdr(m, sizeof(*evl));
1482		if (m == NULL) {
1483			DPRINTF("%s: failed to pullup mbuf\n",
1484			    device_xname(sc->sc_dev));
1485			return -1;
1486		}
1487
1488		evl = mtod(m, struct ether_vlan_header *);
1489		if (evl->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1490			struct ether_header *eh;
1491			uint16_t proto = evl->evl_proto;
1492
1493			vlan_set_tag(m, ntohs(evl->evl_tag));
1494
1495			/*
1496			 * Trim VLAN tag from header.
1497			 */
1498			memmove((uint8_t *)evl + ETHER_VLAN_ENCAP_LEN,
1499			    evl, ETHER_HDR_LEN);
1500			m_adj(m, ETHER_VLAN_ENCAP_LEN);
1501
1502			eh = mtod(m, struct ether_header *);
1503			eh->ether_type = proto;
1504
1505			/*
1506			 * Re-padding.  See sys/net/if_vlan.c:vlan_start().
1507			 */
1508			if (m->m_pkthdr.len < (ETHER_MIN_LEN - ETHER_CRC_LEN +
1509			    ETHER_VLAN_ENCAP_LEN)) {
1510				m_copyback(m, m->m_pkthdr.len,
1511				    (ETHER_MIN_LEN - ETHER_CRC_LEN +
1512				     ETHER_VLAN_ENCAP_LEN) -
1513				    m->m_pkthdr.len, zero_pad);
1514			}
1515
1516			txr->txr_evvlanfixup.ev_count++;
1517		}
1518	}
1519
1520	pkt = txd->txd_req;
1521	pktsize = HVN_PKTSIZE(m, txr->txr_agg_align);
1522	if (pktsize < txr->txr_chim_size) {
1523		chim = hvn_try_txagg(txr, txd, pktsize);
1524		if (chim != NULL)
1525			pkt = chim;
1526	} else {
1527		if (txr->txr_agg_txd != NULL)
1528			hvn_flush_txagg(txr);
1529	}
1530
1531	memset(pkt, 0, HVN_RNDIS_PKT_LEN);
1532	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1533	pkt->rm_len = sizeof(*pkt) + m->m_pkthdr.len;
1534	pkt->rm_dataoffset = RNDIS_DATA_OFFSET;
1535	pkt->rm_datalen = m->m_pkthdr.len;
1536	pkt->rm_pktinfooffset = sizeof(*pkt); /* adjusted below */
1537	pkt->rm_pktinfolen = 0;
1538
1539	if (txr->txr_flags & HVN_TXR_FLAG_UDP_HASH) {
1540		char *cp;
1541
1542		/*
1543		 * Set the hash value for this packet, so that the host could
1544		 * dispatch the TX done event for this packet back to this TX
1545		 * ring's channel.
1546		 */
1547		cp = hvn_rndis_pktinfo_append(pkt, HVN_RNDIS_PKT_LEN,
1548		    HVN_NDIS_HASH_VALUE_SIZE, HVN_NDIS_PKTINFO_TYPE_HASHVAL);
1549		memcpy(cp, &txr->txr_id, HVN_NDIS_HASH_VALUE_SIZE);
1550	}
1551
1552	if (vlan_has_tag(m)) {
1553		uint32_t vlan;
1554		char *cp;
1555		uint16_t tag;
1556
1557		tag = vlan_get_tag(m);
1558		vlan = NDIS_VLAN_INFO_MAKE(EVL_VLANOFTAG(tag),
1559		    EVL_PRIOFTAG(tag), EVL_CFIOFTAG(tag));
1560		cp = hvn_rndis_pktinfo_append(pkt, HVN_RNDIS_PKT_LEN,
1561		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1562		memcpy(cp, &vlan, NDIS_VLAN_INFO_SIZE);
1563		txr->txr_evvlanhwtagging.ev_count++;
1564	}
1565
1566	if (m->m_pkthdr.csum_flags & txr->txr_csum_assist) {
1567		uint32_t csum;
1568		char *cp;
1569
1570		if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv6 | M_CSUM_UDPv6)) {
1571			csum = NDIS_TXCSUM_INFO_IPV6;
1572			l3hlen = M_CSUM_DATA_IPv6_IPHL(m->m_pkthdr.csum_data);
1573			if (m->m_pkthdr.csum_flags & M_CSUM_TCPv6)
1574				csum |= NDIS_TXCSUM_INFO_MKTCPCS(l2hlen +
1575				    l3hlen);
1576			if (m->m_pkthdr.csum_flags & M_CSUM_UDPv6)
1577				csum |= NDIS_TXCSUM_INFO_MKUDPCS(l2hlen +
1578				    l3hlen);
1579		} else {
1580			csum = NDIS_TXCSUM_INFO_IPV4;
1581			l3hlen = M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data);
1582			if (m->m_pkthdr.csum_flags & M_CSUM_IPv4)
1583				csum |= NDIS_TXCSUM_INFO_IPCS;
1584			if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4)
1585				csum |= NDIS_TXCSUM_INFO_MKTCPCS(l2hlen +
1586				    l3hlen);
1587			if (m->m_pkthdr.csum_flags & M_CSUM_UDPv4)
1588				csum |= NDIS_TXCSUM_INFO_MKUDPCS(l2hlen +
1589				    l3hlen);
1590		}
1591		cp = hvn_rndis_pktinfo_append(pkt, HVN_RNDIS_PKT_LEN,
1592		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1593		memcpy(cp, &csum, NDIS_TXCSUM_INFO_SIZE);
1594	}
1595
1596	pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1597	pkt->rm_pktinfooffset -= RNDIS_HEADER_OFFSET;
1598
1599	/*
1600	 * Fast path: Chimney sending.
1601	 */
1602	if (chim != NULL) {
1603		struct hvn_tx_desc *tgt_txd;
1604
1605		tgt_txd = (txr->txr_agg_txd != NULL) ? txr->txr_agg_txd : txd;
1606
1607		KASSERTMSG(pkt == chim,
1608		    "RNDIS pkt not in chimney sending buffer");
1609		KASSERTMSG(tgt_txd->txd_chim_index != HVN_NVS_CHIM_IDX_INVALID,
1610		    "chimney sending buffer is not used");
1611
1612		tgt_txd->txd_chim_size += pkt->rm_len;
1613		m_copydata(m, 0, m->m_pkthdr.len, (uint8_t *)chim + pktlen);
1614
1615		txr->txr_sendpkt = hvn_rndis_output_chim;
1616		goto done;
1617	}
1618
1619	KASSERTMSG(txr->txr_agg_txd == NULL, "aggregating sglist txdesc");
1620	KASSERTMSG(txd->txd_chim_index == HVN_NVS_CHIM_IDX_INVALID,
1621	    "chimney buffer is used");
1622	KASSERTMSG(pkt == txd->txd_req, "RNDIS pkt not in txdesc");
1623
1624	rv = bus_dmamap_load_mbuf(sc->sc_dmat, txd->txd_dmap, m, BUS_DMA_READ |
1625	    BUS_DMA_NOWAIT);
1626	switch (rv) {
1627	case 0:
1628		break;
1629	case EFBIG:
1630		if (m_defrag(m, M_NOWAIT) != NULL) {
1631			txr->txr_evdefrag.ev_count++;
1632			if (bus_dmamap_load_mbuf(sc->sc_dmat, txd->txd_dmap, m,
1633			    BUS_DMA_READ | BUS_DMA_NOWAIT) == 0)
1634				break;
1635		}
1636		/* FALLTHROUGH */
1637	default:
1638		DPRINTF("%s: failed to load mbuf\n", device_xname(sc->sc_dev));
1639		txr->txr_evdmafailed.ev_count++;
1640		return -1;
1641	}
1642	bus_dmamap_sync(sc->sc_dmat, txd->txd_dmap,
1643	    0, txd->txd_dmap->dm_mapsize, BUS_DMASYNC_PREWRITE);
1644	SET(txd->txd_flags, HVN_TXD_FLAG_DMAMAP);
1645
1646	/* Attach an RNDIS message to the first slot */
1647	txd->txd_sgl[0].gpa_page = txd->txd_gpa.gpa_page;
1648	txd->txd_sgl[0].gpa_ofs = txd->txd_gpa.gpa_ofs;
1649	txd->txd_sgl[0].gpa_len = pktlen;
1650	txd->txd_nsge = txd->txd_dmap->dm_nsegs + 1;
1651
1652	for (i = 0; i < txd->txd_dmap->dm_nsegs; i++) {
1653		seg = &txd->txd_dmap->dm_segs[i];
1654		txd->txd_sgl[1 + i].gpa_page = atop(seg->ds_addr);
1655		txd->txd_sgl[1 + i].gpa_ofs = seg->ds_addr & PAGE_MASK;
1656		txd->txd_sgl[1 + i].gpa_len = seg->ds_len;
1657	}
1658
1659	txd->txd_chim_index = HVN_NVS_CHIM_IDX_INVALID;
1660	txd->txd_chim_size = 0;
1661	txr->txr_sendpkt = hvn_rndis_output_sgl;
1662done:
1663	txd->txd_buf = m;
1664
1665	/* Update temporary stats for later use. */
1666	txr->txr_stat_pkts++;
1667	txr->txr_stat_size += m->m_pkthdr.len;
1668	if (m->m_flags & M_MCAST)
1669		txr->txr_stat_mcasts++;
1670
1671	return 0;
1672}
1673
1674static void
1675hvn_bpf_mtap(struct hvn_tx_ring *txr, struct mbuf *m, u_int direction)
1676{
1677	struct hvn_softc *sc = txr->txr_softc;
1678	struct ifnet *ifp = SC2IFP(sc);
1679	struct ether_header *eh;
1680	struct ether_vlan_header evl;
1681
1682	if (!vlan_has_tag(m)) {
1683		bpf_mtap(ifp, m, direction);
1684		return;
1685	}
1686
1687	if (ifp->if_bpf == NULL)
1688		return;
1689
1690	txr->txr_evvlantap.ev_count++;
1691
1692	/*
1693	 * Restore a VLAN tag for bpf.
1694	 *
1695	 * Do not modify contents of the original mbuf,
1696	 * because Tx processing on the mbuf is still in progress.
1697	 */
1698
1699	eh = mtod(m, struct ether_header *);
1700	memcpy(evl.evl_dhost, eh->ether_dhost, ETHER_ADDR_LEN * 2);
1701	evl.evl_encap_proto = htons(ETHERTYPE_VLAN);
1702	evl.evl_tag = htons(vlan_get_tag(m));
1703	evl.evl_proto = eh->ether_type;
1704
1705	/* Do not tap ether header of the original mbuf. */
1706	m_adj(m, sizeof(*eh));
1707
1708	bpf_mtap2(ifp->if_bpf, &evl, sizeof(evl), m, direction);
1709
1710	/* Cannot restore ether header of the original mbuf,
1711	 * but do not worry about it because just free it. */
1712}
1713
1714static int
1715hvn_txpkt(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
1716{
1717	struct hvn_softc *sc = txr->txr_softc;
1718	struct ifnet *ifp = SC2IFP(sc);
1719	const struct hvn_tx_desc *tmp_txd;
1720	int error;
1721
1722	/*
1723	 * Make sure that this txd and any aggregated txds are not
1724	 * freed before bpf_mtap.
1725	 */
1726	hvn_txd_hold(txd);
1727
1728	error = (*txr->txr_sendpkt)(txr, txd);
1729	if (error == 0) {
1730		hvn_bpf_mtap(txr, txd->txd_buf, BPF_D_OUT);
1731		STAILQ_FOREACH(tmp_txd, &txd->txd_agg_list, txd_agg_entry)
1732			hvn_bpf_mtap(txr, tmp_txd->txd_buf, BPF_D_OUT);
1733
1734		if_statadd(ifp, if_opackets, txr->txr_stat_pkts);
1735		if_statadd(ifp, if_obytes, txr->txr_stat_size);
1736		if (txr->txr_stat_mcasts != 0)
1737			if_statadd(ifp, if_omcasts, txr->txr_stat_mcasts);
1738		txr->txr_evpkts.ev_count += txr->txr_stat_pkts;
1739		txr->txr_evsends.ev_count++;
1740	}
1741
1742	hvn_txd_put(txr, txd);
1743
1744	if (__predict_false(error)) {
1745		/*
1746		 * Caller will perform further processing on the
1747		 * associated mbuf, so don't free it in hvn_txd_put();
1748		 * only unload it from the DMA map in hvn_txd_put(),
1749		 * if it was loaded.
1750		 */
1751		txd->txd_buf = NULL;
1752		hvn_txd_put(txr, txd);
1753	}
1754
1755	/* Reset temporary stats, after this sending is done. */
1756	txr->txr_stat_pkts = 0;
1757	txr->txr_stat_size = 0;
1758	txr->txr_stat_mcasts = 0;
1759
1760	return error;
1761}
1762
1763static void
1764hvn_txeof(struct hvn_tx_ring *txr, uint64_t tid)
1765{
1766	struct hvn_softc *sc = txr->txr_softc;
1767	struct hvn_tx_desc *txd;
1768	uint32_t id = tid >> 32;
1769
1770	if ((tid & 0xffffffffU) != 0)
1771		return;
1772
1773	id -= HVN_NVS_CHIM_SIG;
1774	if (id >= HVN_TX_DESC) {
1775		device_printf(sc->sc_dev, "tx packet index too large: %u", id);
1776		return;
1777	}
1778
1779	txd = &txr->txr_desc[id];
1780
1781	if (txd->txd_buf == NULL)
1782		device_printf(sc->sc_dev, "no mbuf @%u\n", id);
1783
1784	hvn_txd_put(txr, txd);
1785}
1786
1787static int
1788hvn_rx_ring_create(struct hvn_softc *sc, int ring_cnt)
1789{
1790	struct hvn_rx_ring *rxr;
1791	int i;
1792
1793	if (sc->sc_proto <= HVN_NVS_PROTO_VERSION_2)
1794		sc->sc_rx_size = 15 * 1024 * 1024;	/* 15MB */
1795	else
1796		sc->sc_rx_size = 16 * 1024 * 1024; 	/* 16MB */
1797	sc->sc_rx_ring = hyperv_dma_alloc(sc->sc_dmat, &sc->sc_rx_dma,
1798	    sc->sc_rx_size, PAGE_SIZE, PAGE_SIZE, sc->sc_rx_size / PAGE_SIZE);
1799	if (sc->sc_rx_ring == NULL) {
1800		DPRINTF("%s: failed to allocate Rx ring buffer\n",
1801		    device_xname(sc->sc_dev));
1802		return -1;
1803	}
1804
1805	sc->sc_rxr = kmem_zalloc(sizeof(*rxr) * ring_cnt, KM_SLEEP);
1806	sc->sc_nrxr_inuse = sc->sc_nrxr = ring_cnt;
1807
1808	for (i = 0; i < sc->sc_nrxr; i++) {
1809		rxr = &sc->sc_rxr[i];
1810		rxr->rxr_softc = sc;
1811		if (i < sc->sc_ntxr) {
1812			rxr->rxr_txr = &sc->sc_txr[i];
1813			rxr->rxr_txr->txr_rxr = rxr;
1814		}
1815
1816		mutex_init(&rxr->rxr_lock, MUTEX_DEFAULT, IPL_NET);
1817		mutex_init(&rxr->rxr_onwork_lock, MUTEX_DEFAULT, IPL_NET);
1818		cv_init(&rxr->rxr_onwork_cv, "waitonwk");
1819
1820		snprintf(rxr->rxr_name, sizeof(rxr->rxr_name),
1821		    "%s-rx%d", device_xname(sc->sc_dev), i);
1822		evcnt_attach_dynamic(&rxr->rxr_evpkts, EVCNT_TYPE_MISC,
1823		    NULL, rxr->rxr_name, "packets received");
1824		evcnt_attach_dynamic(&rxr->rxr_evcsum_ip, EVCNT_TYPE_MISC,
1825		    NULL, rxr->rxr_name, "IP checksum");
1826		evcnt_attach_dynamic(&rxr->rxr_evcsum_tcp, EVCNT_TYPE_MISC,
1827		    NULL, rxr->rxr_name, "TCP checksum");
1828		evcnt_attach_dynamic(&rxr->rxr_evcsum_udp, EVCNT_TYPE_MISC,
1829		    NULL, rxr->rxr_name, "UDP checksum");
1830		evcnt_attach_dynamic(&rxr->rxr_evvlanhwtagging, EVCNT_TYPE_MISC,
1831		    NULL, rxr->rxr_name, "VLAN H/W tagging");
1832		evcnt_attach_dynamic(&rxr->rxr_evintr, EVCNT_TYPE_INTR,
1833		    NULL, rxr->rxr_name, "interrupt on ring");
1834		evcnt_attach_dynamic(&rxr->rxr_evdefer, EVCNT_TYPE_MISC,
1835		    NULL, rxr->rxr_name, "handled queue in workqueue");
1836		evcnt_attach_dynamic(&rxr->rxr_evdeferreq, EVCNT_TYPE_MISC,
1837		    NULL, rxr->rxr_name, "requested defer on ring");
1838		evcnt_attach_dynamic(&rxr->rxr_evredeferreq, EVCNT_TYPE_MISC,
1839		    NULL, rxr->rxr_name, "requested defer in workqueue");
1840
1841		rxr->rxr_nvsbuf = kmem_zalloc(HVN_NVS_BUFSIZE, KM_SLEEP);
1842		if (rxr->rxr_nvsbuf == NULL) {
1843			DPRINTF("%s: failed to allocate channel data buffer\n",
1844			    device_xname(sc->sc_dev));
1845			goto errout;
1846		}
1847
1848		rxr->rxr_si = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE,
1849		    hvn_nvs_softintr, rxr);
1850		if (rxr->rxr_si == NULL) {
1851			DPRINTF("%s: failed to establish rx softint\n",
1852			    device_xname(sc->sc_dev));
1853			goto errout;
1854		}
1855	}
1856
1857	return 0;
1858
1859 errout:
1860	hvn_rx_ring_destroy(sc);
1861	return -1;
1862}
1863
1864static int
1865hvn_rx_ring_destroy(struct hvn_softc *sc)
1866{
1867	struct hvn_rx_ring *rxr;
1868	int i;
1869
1870	if (sc->sc_rxr != NULL) {
1871		for (i = 0; i < sc->sc_nrxr; i++) {
1872			rxr = &sc->sc_rxr[i];
1873
1874			if (rxr->rxr_si != NULL) {
1875				softint_disestablish(rxr->rxr_si);
1876				rxr->rxr_si = NULL;
1877			}
1878
1879			if (rxr->rxr_nvsbuf != NULL) {
1880				kmem_free(rxr->rxr_nvsbuf, HVN_NVS_BUFSIZE);
1881				rxr->rxr_nvsbuf = NULL;
1882			}
1883
1884			evcnt_detach(&rxr->rxr_evpkts);
1885			evcnt_detach(&rxr->rxr_evcsum_ip);
1886			evcnt_detach(&rxr->rxr_evcsum_tcp);
1887			evcnt_detach(&rxr->rxr_evcsum_udp);
1888			evcnt_detach(&rxr->rxr_evvlanhwtagging);
1889			evcnt_detach(&rxr->rxr_evintr);
1890			evcnt_detach(&rxr->rxr_evdefer);
1891			evcnt_detach(&rxr->rxr_evdeferreq);
1892			evcnt_detach(&rxr->rxr_evredeferreq);
1893
1894			cv_destroy(&rxr->rxr_onwork_cv);
1895			mutex_destroy(&rxr->rxr_onwork_lock);
1896			mutex_destroy(&rxr->rxr_lock);
1897		}
1898		kmem_free(sc->sc_rxr, sizeof(*rxr) * sc->sc_nrxr);
1899		sc->sc_rxr = NULL;
1900		sc->sc_nrxr = 0;
1901	}
1902	if (sc->sc_rx_ring != NULL) {
1903		hyperv_dma_free(sc->sc_dmat, &sc->sc_rx_dma);
1904		sc->sc_rx_ring = NULL;
1905	}
1906
1907	return 0;
1908}
1909
1910static void
1911hvn_fixup_rx_data(struct hvn_softc *sc)
1912{
1913	struct hvn_rx_ring *rxr;
1914	int i;
1915
1916	if (sc->sc_caps & HVN_CAPS_UDPHASH) {
1917		for (i = 0; i < sc->sc_nrxr; i++) {
1918			rxr = &sc->sc_rxr[i];
1919			rxr->rxr_flags |= HVN_RXR_FLAG_UDP_HASH;
1920		}
1921	}
1922}
1923
1924static int
1925hvn_tx_ring_create(struct hvn_softc *sc, int ring_cnt)
1926{
1927	struct hvn_tx_ring *txr;
1928	struct hvn_tx_desc *txd;
1929	bus_dma_segment_t *seg;
1930	size_t msgsize;
1931	int i, j;
1932	paddr_t pa;
1933
1934	/*
1935	 * Create TXBUF for chimney sending.
1936	 *
1937	 * NOTE: It is shared by all channels.
1938	 */
1939	sc->sc_chim = hyperv_dma_alloc(sc->sc_dmat, &sc->sc_chim_dma,
1940	    HVN_CHIM_SIZE, PAGE_SIZE, 0, 1);
1941	if (sc->sc_chim == NULL) {
1942		DPRINTF("%s: failed to allocate chimney sending memory",
1943		    device_xname(sc->sc_dev));
1944		goto errout;
1945	}
1946
1947	sc->sc_txr = kmem_zalloc(sizeof(*txr) * ring_cnt, KM_SLEEP);
1948	sc->sc_ntxr_inuse = sc->sc_ntxr = ring_cnt;
1949
1950	msgsize = roundup(HVN_RNDIS_PKT_LEN, 128);
1951
1952	for (j = 0; j < ring_cnt; j++) {
1953		txr = &sc->sc_txr[j];
1954		txr->txr_softc = sc;
1955		txr->txr_id = j;
1956
1957		mutex_init(&txr->txr_lock, MUTEX_DEFAULT, IPL_NET);
1958		txr->txr_interq = pcq_create(HVN_TX_DESC, KM_SLEEP);
1959
1960		snprintf(txr->txr_name, sizeof(txr->txr_name),
1961		    "%s-tx%d", device_xname(sc->sc_dev), j);
1962		evcnt_attach_dynamic(&txr->txr_evpkts, EVCNT_TYPE_MISC,
1963		    NULL, txr->txr_name, "packets transmit");
1964		evcnt_attach_dynamic(&txr->txr_evsends, EVCNT_TYPE_MISC,
1965		    NULL, txr->txr_name, "sends");
1966		evcnt_attach_dynamic(&txr->txr_evnodesc, EVCNT_TYPE_MISC,
1967		    NULL, txr->txr_name, "descriptor shortage");
1968		evcnt_attach_dynamic(&txr->txr_evdmafailed, EVCNT_TYPE_MISC,
1969		    NULL, txr->txr_name, "DMA failure");
1970		evcnt_attach_dynamic(&txr->txr_evdefrag, EVCNT_TYPE_MISC,
1971		    NULL, txr->txr_name, "mbuf defraged");
1972		evcnt_attach_dynamic(&txr->txr_evpcqdrop, EVCNT_TYPE_MISC,
1973		    NULL, txr->txr_name, "dropped in pcq");
1974		evcnt_attach_dynamic(&txr->txr_evtransmitdefer, EVCNT_TYPE_MISC,
1975		    NULL, txr->txr_name, "deferred transmit");
1976		evcnt_attach_dynamic(&txr->txr_evflushfailed, EVCNT_TYPE_MISC,
1977		    NULL, txr->txr_name, "aggregation flush failure");
1978		evcnt_attach_dynamic(&txr->txr_evchimneytried, EVCNT_TYPE_MISC,
1979		    NULL, txr->txr_name, "chimney send tried");
1980		evcnt_attach_dynamic(&txr->txr_evchimney, EVCNT_TYPE_MISC,
1981		    NULL, txr->txr_name, "chimney send");
1982		evcnt_attach_dynamic(&txr->txr_evvlanfixup, EVCNT_TYPE_MISC,
1983		    NULL, txr->txr_name, "VLAN fixup");
1984		evcnt_attach_dynamic(&txr->txr_evvlanhwtagging, EVCNT_TYPE_MISC,
1985		    NULL, txr->txr_name, "VLAN H/W tagging");
1986		evcnt_attach_dynamic(&txr->txr_evvlantap, EVCNT_TYPE_MISC,
1987		    NULL, txr->txr_name, "VLAN bpf_mtap fixup");
1988
1989		txr->txr_si = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE,
1990		    hvn_deferred_transmit, txr);
1991		if (txr->txr_si == NULL) {
1992			aprint_error_dev(sc->sc_dev,
1993			    "failed to establish softint for tx ring\n");
1994			goto errout;
1995		}
1996
1997		/* Allocate memory to store RNDIS messages */
1998		txr->txr_msgs = hyperv_dma_alloc(sc->sc_dmat, &txr->txr_dma,
1999		    msgsize * HVN_TX_DESC, PAGE_SIZE, 0, 1);
2000		if (txr->txr_msgs == NULL) {
2001			DPRINTF("%s: failed to allocate memory for RDNIS "
2002			    "messages\n", device_xname(sc->sc_dev));
2003			goto errout;
2004		}
2005
2006		TAILQ_INIT(&txr->txr_list);
2007		for (i = 0; i < HVN_TX_DESC; i++) {
2008			txd = &txr->txr_desc[i];
2009			txd->txd_chim_index = HVN_NVS_CHIM_IDX_INVALID;
2010			txd->txd_chim_size = 0;
2011			STAILQ_INIT(&txd->txd_agg_list);
2012			if (bus_dmamap_create(sc->sc_dmat, HVN_TX_PKT_SIZE,
2013			    HVN_TX_FRAGS, HVN_TX_FRAG_SIZE, PAGE_SIZE,
2014			    BUS_DMA_WAITOK, &txd->txd_dmap)) {
2015				DPRINTF("%s: failed to create map for TX "
2016				    "descriptors\n", device_xname(sc->sc_dev));
2017				goto errout;
2018			}
2019			seg = &txr->txr_dma.map->dm_segs[0];
2020			pa = seg->ds_addr + (msgsize * i);
2021			txd->txd_gpa.gpa_page = atop(pa);
2022			txd->txd_gpa.gpa_ofs = pa & PAGE_MASK;
2023			txd->txd_gpa.gpa_len = msgsize;
2024			txd->txd_req = (void *)(txr->txr_msgs + (msgsize * i));
2025			txd->txd_id = i + HVN_NVS_CHIM_SIG;
2026			TAILQ_INSERT_TAIL(&txr->txr_list, txd, txd_entry);
2027		}
2028		txr->txr_avail = HVN_TX_DESC;
2029	}
2030
2031	return 0;
2032
2033 errout:
2034	hvn_tx_ring_destroy(sc);
2035	return -1;
2036}
2037
2038static void
2039hvn_tx_ring_destroy(struct hvn_softc *sc)
2040{
2041	struct hvn_tx_ring *txr;
2042	struct hvn_tx_desc *txd;
2043	int i, j;
2044
2045	if (sc->sc_txr != NULL) {
2046		for (j = 0; j < sc->sc_ntxr; j++) {
2047			txr = &sc->sc_txr[j];
2048
2049			mutex_enter(&txr->txr_lock);
2050			for (i = 0; i < HVN_TX_DESC; i++) {
2051				txd = &txr->txr_desc[i];
2052				hvn_txd_gc(txr, txd);
2053			}
2054			mutex_exit(&txr->txr_lock);
2055			for (i = 0; i < HVN_TX_DESC; i++) {
2056				txd = &txr->txr_desc[i];
2057				if (txd->txd_dmap != NULL) {
2058					bus_dmamap_destroy(sc->sc_dmat,
2059					    txd->txd_dmap);
2060					txd->txd_dmap = NULL;
2061				}
2062			}
2063			if (txr->txr_msgs != NULL) {
2064				hyperv_dma_free(sc->sc_dmat, &txr->txr_dma);
2065				txr->txr_msgs = NULL;
2066			}
2067			if (txr->txr_si != NULL) {
2068				softint_disestablish(txr->txr_si);
2069				txr->txr_si = NULL;
2070			}
2071			if (txr->txr_interq != NULL) {
2072				hvn_tx_ring_qflush(sc, txr);
2073				pcq_destroy(txr->txr_interq);
2074				txr->txr_interq = NULL;
2075			}
2076
2077			evcnt_detach(&txr->txr_evpkts);
2078			evcnt_detach(&txr->txr_evsends);
2079			evcnt_detach(&txr->txr_evnodesc);
2080			evcnt_detach(&txr->txr_evdmafailed);
2081			evcnt_detach(&txr->txr_evdefrag);
2082			evcnt_detach(&txr->txr_evpcqdrop);
2083			evcnt_detach(&txr->txr_evtransmitdefer);
2084			evcnt_detach(&txr->txr_evflushfailed);
2085			evcnt_detach(&txr->txr_evchimneytried);
2086			evcnt_detach(&txr->txr_evchimney);
2087			evcnt_detach(&txr->txr_evvlanfixup);
2088			evcnt_detach(&txr->txr_evvlanhwtagging);
2089			evcnt_detach(&txr->txr_evvlantap);
2090
2091			mutex_destroy(&txr->txr_lock);
2092		}
2093
2094		kmem_free(sc->sc_txr, sizeof(*txr) * sc->sc_ntxr);
2095		sc->sc_txr = NULL;
2096	}
2097
2098	if (sc->sc_chim != NULL) {
2099		hyperv_dma_free(sc->sc_dmat, &sc->sc_chim_dma);
2100		sc->sc_chim = NULL;
2101	}
2102}
2103
2104static void
2105hvn_set_chim_size(struct hvn_softc *sc, int chim_size)
2106{
2107	struct hvn_tx_ring *txr;
2108	int i;
2109
2110	for (i = 0; i < sc->sc_ntxr_inuse; i++) {
2111		txr = &sc->sc_txr[i];
2112		txr->txr_chim_size = chim_size;
2113	}
2114}
2115
2116#if LONG_BIT == 64
2117#define ffsl(v)	ffs64(v)
2118#elif LONG_BIT == 32
2119#define ffsl(v)	ffs32(v)
2120#else
2121#error unsupport LONG_BIT
2122#endif  /* LONG_BIT */
2123
2124static uint32_t
2125hvn_chim_alloc(struct hvn_softc *sc)
2126{
2127	uint32_t chim_idx = HVN_NVS_CHIM_IDX_INVALID;
2128	int i, idx;
2129
2130	mutex_spin_enter(&sc->sc_chim_bmap_lock);
2131	for (i = 0; i < sc->sc_chim_bmap_cnt; i++) {
2132		idx = ffsl(~sc->sc_chim_bmap[i]);
2133		if (idx == 0)
2134			continue;
2135
2136		--idx;	/* ffsl is 1-based */
2137		SET(sc->sc_chim_bmap[i], __BIT(idx));
2138
2139		chim_idx = i * LONG_BIT + idx;
2140		break;
2141	}
2142	mutex_spin_exit(&sc->sc_chim_bmap_lock);
2143
2144	return chim_idx;
2145}
2146
2147static void
2148hvn_chim_free(struct hvn_softc *sc, uint32_t chim_idx)
2149{
2150	u_long mask;
2151	uint32_t idx;
2152
2153	idx = chim_idx / LONG_BIT;
2154	mask = __BIT(chim_idx % LONG_BIT);
2155
2156	mutex_spin_enter(&sc->sc_chim_bmap_lock);
2157	CLR(sc->sc_chim_bmap[idx], mask);
2158	mutex_spin_exit(&sc->sc_chim_bmap_lock);
2159}
2160
2161static void
2162hvn_fixup_tx_data(struct hvn_softc *sc)
2163{
2164	struct hvn_tx_ring *txr;
2165	uint64_t caps_assist;
2166	int csum_assist;
2167	int i;
2168
2169	hvn_set_chim_size(sc, sc->sc_chim_szmax);
2170	if (hvn_tx_chimney_size > 0 && hvn_tx_chimney_size < sc->sc_chim_szmax)
2171		hvn_set_chim_size(sc, hvn_tx_chimney_size);
2172
2173	caps_assist = 0;
2174	csum_assist = 0;
2175	if (sc->sc_caps & HVN_CAPS_IPCS) {
2176		caps_assist |= IFCAP_CSUM_IPv4_Tx;
2177		caps_assist |= IFCAP_CSUM_IPv4_Rx;
2178		csum_assist |= M_CSUM_IPv4;
2179	}
2180	if (sc->sc_caps & HVN_CAPS_TCP4CS) {
2181		caps_assist |= IFCAP_CSUM_TCPv4_Tx;
2182		caps_assist |= IFCAP_CSUM_TCPv4_Rx;
2183		csum_assist |= M_CSUM_TCPv4;
2184	}
2185	if (sc->sc_caps &  HVN_CAPS_TCP6CS) {
2186		caps_assist |= IFCAP_CSUM_TCPv6_Tx;
2187		csum_assist |= M_CSUM_TCPv6;
2188	}
2189	if (sc->sc_caps & HVN_CAPS_UDP4CS) {
2190		caps_assist |= IFCAP_CSUM_UDPv4_Tx;
2191		caps_assist |= IFCAP_CSUM_UDPv4_Rx;
2192		csum_assist |= M_CSUM_UDPv4;
2193	}
2194	if (sc->sc_caps & HVN_CAPS_UDP6CS) {
2195		caps_assist |= IFCAP_CSUM_UDPv6_Tx;
2196		csum_assist |= M_CSUM_UDPv6;
2197	}
2198	for (i = 0; i < sc->sc_ntxr; i++) {
2199		txr = &sc->sc_txr[i];
2200		txr->txr_caps_assist = caps_assist;
2201		txr->txr_csum_assist = csum_assist;
2202	}
2203
2204	if (sc->sc_caps & HVN_CAPS_UDPHASH) {
2205		for (i = 0; i < sc->sc_ntxr; i++) {
2206			txr = &sc->sc_txr[i];
2207			txr->txr_flags |= HVN_TXR_FLAG_UDP_HASH;
2208		}
2209	}
2210}
2211
2212static int
2213hvn_txd_peek(struct hvn_tx_ring *txr)
2214{
2215
2216	KASSERT(mutex_owned(&txr->txr_lock));
2217
2218	return txr->txr_avail;
2219}
2220
2221static struct hvn_tx_desc *
2222hvn_txd_get(struct hvn_tx_ring *txr)
2223{
2224	struct hvn_tx_desc *txd;
2225
2226	KASSERT(mutex_owned(&txr->txr_lock));
2227
2228	txd = TAILQ_FIRST(&txr->txr_list);
2229	KASSERT(txd != NULL);
2230	TAILQ_REMOVE(&txr->txr_list, txd, txd_entry);
2231	txr->txr_avail--;
2232
2233	txd->txd_refs = 1;
2234
2235	return txd;
2236}
2237
2238static void
2239hvn_txd_put(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
2240{
2241	struct hvn_softc *sc = txr->txr_softc;
2242	struct hvn_tx_desc *tmp_txd;
2243
2244	KASSERT(mutex_owned(&txr->txr_lock));
2245	KASSERTMSG(!ISSET(txd->txd_flags, HVN_TXD_FLAG_ONAGG),
2246	    "put an onagg txd %#x", txd->txd_flags);
2247
2248	KASSERTMSG(txd->txd_refs > 0, "invalid txd refs %d", txd->txd_refs);
2249	if (atomic_dec_uint_nv(&txd->txd_refs) != 0)
2250		return;
2251
2252	if (!STAILQ_EMPTY(&txd->txd_agg_list)) {
2253		while ((tmp_txd = STAILQ_FIRST(&txd->txd_agg_list)) != NULL) {
2254			KASSERTMSG(STAILQ_EMPTY(&tmp_txd->txd_agg_list),
2255			    "resursive aggregation on aggregated txdesc");
2256			KASSERTMSG(
2257			    ISSET(tmp_txd->txd_flags, HVN_TXD_FLAG_ONAGG),
2258			    "not aggregated txdesc");
2259			KASSERTMSG(
2260			    tmp_txd->txd_chim_index == HVN_NVS_CHIM_IDX_INVALID,
2261			    "aggregated txdesc consumes chimney sending "
2262			    "buffer: idx %u", tmp_txd->txd_chim_index);
2263			KASSERTMSG(tmp_txd->txd_chim_size == 0,
2264			    "aggregated txdesc has non-zero chimney sending "
2265			    "size: sz %u", tmp_txd->txd_chim_size);
2266
2267			STAILQ_REMOVE_HEAD(&txd->txd_agg_list, txd_agg_entry);
2268			CLR(tmp_txd->txd_flags, HVN_TXD_FLAG_ONAGG);
2269			hvn_txd_put(txr, tmp_txd);
2270		}
2271	}
2272
2273	if (txd->txd_chim_index != HVN_NVS_CHIM_IDX_INVALID) {
2274		KASSERTMSG(!ISSET(txd->txd_flags, HVN_TXD_FLAG_DMAMAP),
2275		    "chim txd uses dmamap");
2276		hvn_chim_free(sc, txd->txd_chim_index);
2277		txd->txd_chim_index = HVN_NVS_CHIM_IDX_INVALID;
2278		txd->txd_chim_size = 0;
2279	} else if (ISSET(txd->txd_flags, HVN_TXD_FLAG_DMAMAP)) {
2280		bus_dmamap_sync(sc->sc_dmat, txd->txd_dmap,
2281		    0, txd->txd_dmap->dm_mapsize,
2282		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
2283		bus_dmamap_unload(sc->sc_dmat, txd->txd_dmap);
2284		CLR(txd->txd_flags, HVN_TXD_FLAG_DMAMAP);
2285	}
2286
2287	if (txd->txd_buf != NULL) {
2288		m_freem(txd->txd_buf);
2289		txd->txd_buf = NULL;
2290	}
2291
2292	TAILQ_INSERT_TAIL(&txr->txr_list, txd, txd_entry);
2293	txr->txr_avail++;
2294	txr->txr_oactive = 0;
2295}
2296
2297static void
2298hvn_txd_gc(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
2299{
2300
2301	KASSERTMSG(txd->txd_refs == 0 || txd->txd_refs == 1,
2302	    "invalid txd refs %d", txd->txd_refs);
2303
2304	/* Aggregated txds will be freed by their aggregating txd. */
2305	if (txd->txd_refs > 0 && !ISSET(txd->txd_flags, HVN_TXD_FLAG_ONAGG))
2306		hvn_txd_put(txr, txd);
2307}
2308
2309static void
2310hvn_txd_hold(struct hvn_tx_desc *txd)
2311{
2312
2313	/* 0->1 transition will never work */
2314	KASSERTMSG(txd->txd_refs > 0, "invalid txd refs %d", txd->txd_refs);
2315
2316	atomic_inc_uint(&txd->txd_refs);
2317}
2318
2319static void
2320hvn_txd_agg(struct hvn_tx_desc *agg_txd, struct hvn_tx_desc *txd)
2321{
2322
2323	KASSERTMSG(!ISSET(agg_txd->txd_flags, HVN_TXD_FLAG_ONAGG),
2324	    "recursive aggregation on aggregating txdesc");
2325	KASSERTMSG(!ISSET(txd->txd_flags, HVN_TXD_FLAG_ONAGG),
2326	    "already aggregated");
2327	KASSERTMSG(STAILQ_EMPTY(&txd->txd_agg_list),
2328	    "recursive aggregation on to-be-aggregated txdesc");
2329
2330	SET(txd->txd_flags, HVN_TXD_FLAG_ONAGG);
2331	STAILQ_INSERT_TAIL(&agg_txd->txd_agg_list, txd, txd_agg_entry);
2332}
2333
2334static int
2335hvn_tx_ring_pending(struct hvn_tx_ring *txr)
2336{
2337	int pending = 0;
2338
2339	mutex_enter(&txr->txr_lock);
2340	if (hvn_txd_peek(txr) != HVN_TX_DESC)
2341		pending = 1;
2342	mutex_exit(&txr->txr_lock);
2343
2344	return pending;
2345}
2346
2347static void
2348hvn_tx_ring_qflush(struct hvn_softc *sc, struct hvn_tx_ring *txr)
2349{
2350	struct mbuf *m;
2351
2352	while ((m = pcq_get(txr->txr_interq)) != NULL)
2353		m_freem(m);
2354}
2355
2356static int
2357hvn_get_lladdr(struct hvn_softc *sc, uint8_t *enaddr)
2358{
2359	size_t addrlen = ETHER_ADDR_LEN;
2360	int rv;
2361
2362	rv = hvn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, enaddr, &addrlen);
2363	if (rv == 0 && addrlen != ETHER_ADDR_LEN)
2364		rv = -1;
2365	return rv;
2366}
2367
2368static void
2369hvn_update_link_status(struct hvn_softc *sc)
2370{
2371	struct ifnet *ifp = SC2IFP(sc);
2372	uint32_t state, old_link_state;
2373	size_t len = sizeof(state);
2374	int rv;
2375
2376	rv = hvn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, &state, &len);
2377	if (rv != 0 || len != sizeof(state))
2378		return;
2379
2380	old_link_state = sc->sc_link_state;
2381	sc->sc_link_state = (state == NDIS_MEDIA_STATE_CONNECTED) ?
2382	    LINK_STATE_UP : LINK_STATE_DOWN;
2383	if (old_link_state != sc->sc_link_state) {
2384		if_link_state_change(ifp, sc->sc_link_state);
2385	}
2386}
2387
2388static int
2389hvn_get_mtu(struct hvn_softc *sc, uint32_t *mtu)
2390{
2391	size_t mtusz = sizeof(*mtu);
2392	int rv;
2393
2394	rv = hvn_rndis_query(sc, OID_GEN_MAXIMUM_FRAME_SIZE, mtu, &mtusz);
2395	if (rv == 0 && mtusz != sizeof(*mtu))
2396		rv = -1;
2397	return rv;
2398}
2399
2400static int
2401hvn_channel_attach(struct hvn_softc *sc, struct vmbus_channel *chan)
2402{
2403	struct hvn_rx_ring *rxr;
2404	struct hvn_tx_ring *txr;
2405	int idx;
2406
2407	idx = chan->ch_subidx;
2408	if (idx < 0 || idx >= sc->sc_nrxr_inuse) {
2409		DPRINTF("%s: invalid sub-channel %u\n",
2410		    device_xname(sc->sc_dev), idx);
2411		return -1;
2412	}
2413
2414	rxr = &sc->sc_rxr[idx];
2415	rxr->rxr_chan = chan;
2416
2417	if (idx < sc->sc_ntxr_inuse) {
2418		txr = &sc->sc_txr[idx];
2419		txr->txr_chan = chan;
2420	}
2421
2422	/* Bind this channel to a proper CPU. */
2423	vmbus_channel_cpu_set(chan, HVN_RING_IDX2CPU(sc, idx));
2424
2425	chan->ch_flags &= ~CHF_BATCHED;
2426
2427	/* Associate our interrupt handler with the channel */
2428	if (vmbus_channel_open(chan,
2429	    HVN_RING_BUFSIZE - sizeof(struct vmbus_bufring), NULL, 0,
2430	    hvn_nvs_intr, rxr)) {
2431		DPRINTF("%s: failed to open channel\n",
2432		    device_xname(sc->sc_dev));
2433		return -1;
2434	}
2435
2436	return 0;
2437}
2438
2439static void
2440hvn_channel_detach(struct hvn_softc *sc, struct vmbus_channel *chan)
2441{
2442
2443	vmbus_channel_close_direct(chan);
2444}
2445
2446static void
2447hvn_channel_detach_all(struct hvn_softc *sc)
2448{
2449	struct vmbus_channel **subchans;
2450	int i, subchan_cnt = sc->sc_nrxr_inuse - 1;
2451
2452	if (subchan_cnt > 0) {
2453		/* Detach the sub-channels. */
2454		subchans = vmbus_subchannel_get(sc->sc_prichan, subchan_cnt);
2455		for (i = 0; i < subchan_cnt; i++)
2456			hvn_channel_detach(sc, subchans[i]);
2457		vmbus_subchannel_rel(subchans, subchan_cnt);
2458	}
2459
2460	/*
2461	 * Detach the primary channel, _after_ all sub-channels
2462	 * are detached.
2463	 */
2464	hvn_channel_detach(sc, sc->sc_prichan);
2465
2466	/* Wait for sub-channels to be destroyed, if any. */
2467	vmbus_subchannel_drain(sc->sc_prichan);
2468}
2469
2470static int
2471hvn_subchannel_attach(struct hvn_softc *sc)
2472{
2473	struct vmbus_channel **subchans;
2474	int subchan_cnt = sc->sc_nrxr_inuse - 1;
2475	int i, error = 0;
2476
2477	KASSERTMSG(subchan_cnt > 0, "no sub-channels");
2478
2479	/* Attach the sub-channels. */
2480	subchans = vmbus_subchannel_get(sc->sc_prichan, subchan_cnt);
2481	for (i = 0; i < subchan_cnt; ++i) {
2482		int error1;
2483
2484		error1 = hvn_channel_attach(sc, subchans[i]);
2485		if (error1) {
2486			error = error1;
2487			/* Move on; all channels will be detached later. */
2488		}
2489	}
2490	vmbus_subchannel_rel(subchans, subchan_cnt);
2491
2492	if (error) {
2493		aprint_error_dev(sc->sc_dev,
2494		    "sub-channels attach failed: %d\n", error);
2495		return error;
2496	}
2497
2498	aprint_debug_dev(sc->sc_dev, "%d sub-channels attached\n",
2499	    subchan_cnt);
2500	return 0;
2501}
2502
2503static int
2504hvn_synth_alloc_subchannels(struct hvn_softc *sc, int *nsubch)
2505{
2506	struct vmbus_channel **subchans;
2507	int error, nchan, rxr_cnt;
2508
2509	nchan = *nsubch + 1;
2510	if (nchan < 2) {
2511		/* Multiple RX/TX rings are not requested. */
2512		*nsubch = 0;
2513		return 0;
2514	}
2515
2516	/*
2517	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
2518	 * table entries.
2519	 */
2520	if (hvn_get_rsscaps(sc, &rxr_cnt)) {
2521		/* No RSS. */
2522		*nsubch = 0;
2523		return 0;
2524	}
2525
2526	aprint_debug_dev(sc->sc_dev, "RX rings offered %u, requested %d\n",
2527	    rxr_cnt, nchan);
2528
2529	if (nchan > rxr_cnt)
2530		nchan = rxr_cnt;
2531	if (nchan == 1) {
2532		aprint_debug_dev(sc->sc_dev,
2533		    "only 1 channel is supported, no vRSS\n");
2534		*nsubch = 0;
2535		return 0;
2536	}
2537
2538	*nsubch = nchan - 1;
2539	error = hvn_nvs_alloc_subchannels(sc, nsubch);
2540	if (error || *nsubch == 0) {
2541		/* Failed to allocate sub-channels. */
2542		*nsubch = 0;
2543		return 0;
2544	}
2545
2546	/*
2547	 * Wait for all sub-channels to become ready before moving on.
2548	 */
2549	subchans = vmbus_subchannel_get(sc->sc_prichan, *nsubch);
2550	vmbus_subchannel_rel(subchans, *nsubch);
2551	return 0;
2552}
2553
2554static int
2555hvn_synth_attachable(const struct hvn_softc *sc)
2556{
2557#if 0
2558	const struct hvn_rx_ring *rxr;
2559	int i;
2560
2561	for (i = 0; i < sc->sc_nrxr; i++) {
2562		rxr = &sc->sc_rxr[i];
2563		if (rxr->rxr_flags)
2564			return 0;
2565	}
2566#endif
2567	return 1;
2568}
2569
2570/*
2571 * Make sure that the RX filter is zero after the successful
2572 * RNDIS initialization.
2573 *
2574 * NOTE:
2575 * Under certain conditions on certain versions of Hyper-V,
2576 * the RNDIS rxfilter is _not_ zero on the hypervisor side
2577 * after the successful RNDIS initialization, which breaks
2578 * the assumption of any following code (well, it breaks the
2579 * RNDIS API contract actually).  Clear the RNDIS rxfilter
2580 * explicitly, drain packets sneaking through, and drain the
2581 * interrupt taskqueues scheduled due to the stealth packets.
2582 */
2583static void
2584hvn_init_fixat(struct hvn_softc *sc, int nchan)
2585{
2586
2587	hvn_disable_rx(sc);
2588	hvn_drain_rxtx(sc, nchan);
2589}
2590
2591static void
2592hvn_set_txagg(struct hvn_softc *sc)
2593{
2594	struct hvn_tx_ring *txr;
2595	uint32_t size, pkts;
2596	int i;
2597
2598	/*
2599	 * Setup aggregation size.
2600	 */
2601	if (sc->sc_agg_size < 0)
2602		size = UINT32_MAX;
2603	else
2604		size = sc->sc_agg_size;
2605
2606	if (size > sc->sc_rndis_agg_size)
2607		size = sc->sc_rndis_agg_size;
2608
2609	/* NOTE: We only aggregate packets using chimney sending buffers. */
2610	if (size > (uint32_t)sc->sc_chim_szmax)
2611		size = sc->sc_chim_szmax;
2612
2613	if (size <= 2 * HVN_PKTSIZE_MIN(sc->sc_rndis_agg_align)) {
2614		/* Disable */
2615		size = 0;
2616		pkts = 0;
2617		goto done;
2618	}
2619
2620	/* NOTE: Type of the per TX ring setting is 'int'. */
2621	if (size > INT_MAX)
2622		size = INT_MAX;
2623
2624	/*
2625	 * Setup aggregation packet count.
2626	 */
2627	if (sc->sc_agg_pkts < 0)
2628		pkts = UINT32_MAX;
2629	else
2630		pkts = sc->sc_agg_pkts;
2631
2632	if (pkts > sc->sc_rndis_agg_pkts)
2633		pkts = sc->sc_rndis_agg_pkts;
2634
2635	if (pkts <= 1) {
2636		/* Disable */
2637		size = 0;
2638		pkts = 0;
2639		goto done;
2640	}
2641
2642	/* NOTE: Type of the per TX ring setting is 'short'. */
2643	if (pkts > SHRT_MAX)
2644		pkts = SHRT_MAX;
2645
2646done:
2647	/* NOTE: Type of the per TX ring setting is 'short'. */
2648	if (sc->sc_rndis_agg_align > SHRT_MAX) {
2649		/* Disable */
2650		size = 0;
2651		pkts = 0;
2652	}
2653
2654	aprint_verbose_dev(sc->sc_dev,
2655	    "TX aggregate size %u, pkts %u, align %u\n",
2656	    size, pkts, sc->sc_rndis_agg_align);
2657
2658	for (i = 0; i < sc->sc_ntxr_inuse; ++i) {
2659		txr = &sc->sc_txr[i];
2660
2661		mutex_enter(&txr->txr_lock);
2662		txr->txr_agg_szmax = size;
2663		txr->txr_agg_pktmax = pkts;
2664		txr->txr_agg_align = sc->sc_rndis_agg_align;
2665		mutex_exit(&txr->txr_lock);
2666	}
2667}
2668
2669static int
2670hvn_synth_attach(struct hvn_softc *sc, int mtu)
2671{
2672	uint8_t rss_key[RSS_KEYSIZE];
2673	uint32_t old_caps;
2674	int nchan = 1, nsubch;
2675	int i, error;
2676
2677	if (!hvn_synth_attachable(sc))
2678		return ENXIO;
2679
2680	/* Save capabilities for later verification. */
2681	old_caps = sc->sc_caps;
2682	sc->sc_caps = 0;
2683
2684	/* Clear RSS stuffs. */
2685	sc->sc_rss_ind_size = 0;
2686	sc->sc_rss_hash = 0;
2687	sc->sc_rss_hcap = 0;
2688
2689	/*
2690	 * Attach the primary channel _before_ attaching NVS and RNDIS.
2691	 */
2692	error = hvn_channel_attach(sc, sc->sc_prichan);
2693	if (error) {
2694		aprint_error_dev(sc->sc_dev,
2695		    "failed to attach primary channel\n");
2696		goto failed;
2697	}
2698
2699	/*
2700	 * Attach NVS.
2701	 */
2702	error = hvn_nvs_attach(sc, mtu);
2703	if (error) {
2704		aprint_error_dev(sc->sc_dev, "failed to init NVSP\n");
2705		goto detach_channel;
2706	}
2707
2708	/*
2709	 * Attach RNDIS _after_ NVS is attached.
2710	 */
2711	error = hvn_rndis_attach(sc, mtu);
2712	if (error) {
2713		aprint_error_dev(sc->sc_dev, "failed to init RNDIS\n");
2714		goto detach_nvs;
2715	}
2716
2717	error = hvn_set_capabilities(sc, mtu);
2718	if (error) {
2719		aprint_error_dev(sc->sc_dev, "failed to setup offloading\n");
2720		goto detach_rndis;
2721	}
2722
2723	if ((sc->sc_flags & HVN_SCF_ATTACHED) && old_caps != sc->sc_caps) {
2724		device_printf(sc->sc_dev, "caps mismatch "
2725		    "old 0x%08x, new 0x%08x\n", old_caps, sc->sc_caps);
2726		error = ENXIO;
2727		goto detach_rndis;
2728	}
2729
2730	/*
2731	 * Allocate sub-channels for multi-TX/RX rings.
2732	 *
2733	 * NOTE:
2734	 * The # of RX rings that can be used is equivalent to the # of
2735	 * channels to be requested.
2736	 */
2737	nsubch = sc->sc_nrxr - 1;
2738	error = hvn_synth_alloc_subchannels(sc, &nsubch);
2739	if (error) {
2740		aprint_error_dev(sc->sc_dev,
2741		    "failed to allocate sub channels\n");
2742		goto detach_synth;
2743	}
2744
2745	/*
2746	 * Set the # of TX/RX rings that could be used according to
2747	 * the # of channels that NVS offered.
2748	 */
2749	nchan = nsubch + 1;
2750	hvn_set_ring_inuse(sc, nchan);
2751
2752	if (nchan > 1) {
2753		/*
2754		 * Attach the sub-channels.
2755		 *
2756		 * NOTE: hvn_set_ring_inuse() _must_ have been called.
2757		 */
2758		error = hvn_subchannel_attach(sc);
2759		if (error) {
2760			aprint_error_dev(sc->sc_dev,
2761			    "failed to attach sub channels\n");
2762			goto detach_synth;
2763		}
2764
2765		/*
2766		 * Configure RSS key and indirect table _after_ all sub-channels
2767		 * are attached.
2768		 */
2769		if (!(sc->sc_flags & HVN_SCF_HAS_RSSKEY)) {
2770			/* Set the default RSS key. */
2771			CTASSERT(sizeof(sc->sc_rss.rss_key) == sizeof(rss_key));
2772			rss_getkey(rss_key);
2773			memcpy(&sc->sc_rss.rss_key, rss_key,
2774			    sizeof(sc->sc_rss.rss_key));
2775			sc->sc_flags |= HVN_SCF_HAS_RSSKEY;
2776		}
2777
2778		if (!(sc->sc_flags & HVN_SCF_HAS_RSSIND)) {
2779			/* Setup RSS indirect table in round-robin fashion. */
2780			for (i = 0; i < NDIS_HASH_INDCNT; i++) {
2781				sc->sc_rss.rss_ind[i] = i % nchan;
2782			}
2783			sc->sc_flags |= HVN_SCF_HAS_RSSIND;
2784		} else {
2785			/*
2786			 * # of usable channels may be changed, so we have to
2787			 * make sure that all entries in RSS indirect table
2788			 * are valid.
2789			 *
2790			 * NOTE: hvn_set_ring_inuse() _must_ have been called.
2791			 */
2792			hvn_fixup_rss_ind(sc);
2793		}
2794
2795		sc->sc_rss_hash = sc->sc_rss_hcap;
2796		error = hvn_set_rss(sc, NDIS_RSS_FLAG_NONE);
2797		if (error) {
2798			aprint_error_dev(sc->sc_dev, "failed to setup RSS\n");
2799			goto detach_synth;
2800		}
2801	}
2802
2803	/*
2804	 * Fixup transmission aggregation setup.
2805	 */
2806	hvn_set_txagg(sc);
2807	hvn_init_fixat(sc, nchan);
2808	return 0;
2809
2810detach_synth:
2811	hvn_init_fixat(sc, nchan);
2812	hvn_synth_detach(sc);
2813	return error;
2814
2815detach_rndis:
2816	hvn_init_fixat(sc, nchan);
2817	hvn_rndis_detach(sc);
2818detach_nvs:
2819	hvn_nvs_detach(sc);
2820detach_channel:
2821	hvn_channel_detach(sc, sc->sc_prichan);
2822failed:
2823	/* Restore old capabilities. */
2824	sc->sc_caps = old_caps;
2825	return error;
2826}
2827
2828static void
2829hvn_synth_detach(struct hvn_softc *sc)
2830{
2831
2832	/* Detach the RNDIS first. */
2833	hvn_rndis_detach(sc);
2834
2835	/* Detach NVS. */
2836	hvn_nvs_detach(sc);
2837
2838	/* Detach all of the channels. */
2839	hvn_channel_detach_all(sc);
2840
2841	if (sc->sc_prichan->ch_sc->sc_proto >= VMBUS_VERSION_WIN10 &&
2842	    sc->sc_rx_hndl) {
2843		/*
2844		 * Host is post-Win2016, disconnect RXBUF from primary channel
2845		 * here.
2846		 */
2847		vmbus_handle_free(sc->sc_prichan, sc->sc_rx_hndl);
2848		sc->sc_rx_hndl = 0;
2849	}
2850
2851	if (sc->sc_prichan->ch_sc->sc_proto >= VMBUS_VERSION_WIN10 &&
2852	    sc->sc_chim_hndl) {
2853		/*
2854		 * Host is post-Win2016, disconnect chimney sending buffer
2855		 * from primary channel here.
2856		 */
2857		vmbus_handle_free(sc->sc_prichan, sc->sc_chim_hndl);
2858		sc->sc_chim_hndl = 0;
2859	}
2860}
2861
2862static void
2863hvn_set_ring_inuse(struct hvn_softc *sc, int ring_cnt)
2864{
2865
2866	if (sc->sc_ntxr > ring_cnt)
2867		sc->sc_ntxr_inuse = ring_cnt;
2868	else
2869		sc->sc_ntxr_inuse = sc->sc_ntxr;
2870	sc->sc_nrxr_inuse = ring_cnt;
2871}
2872
2873static void
2874hvn_channel_drain(struct hvn_softc *sc, struct vmbus_channel *chan)
2875{
2876	struct hvn_rx_ring *rxr;
2877	int i, s;
2878
2879	for (rxr = NULL, i = 0; i < sc->sc_nrxr_inuse; i++) {
2880		rxr = &sc->sc_rxr[i];
2881		if (rxr->rxr_chan == chan)
2882			break;
2883	}
2884	KASSERT(i < sc->sc_nrxr_inuse);
2885
2886	/*
2887	 * NOTE:
2888	 * The TX bufring will not be drained by the hypervisor,
2889	 * if the primary channel is revoked.
2890	 */
2891	while (!vmbus_channel_rx_empty(chan) ||
2892	    (!vmbus_channel_is_revoked(sc->sc_prichan) &&
2893	     !vmbus_channel_tx_empty(chan))) {
2894		DELAY(20);
2895		s = splnet();
2896		hvn_nvs_intr1(rxr, sc->sc_tx_process_limit,
2897		    sc->sc_rx_process_limit);
2898		splx(s);
2899	}
2900
2901	mutex_enter(&rxr->rxr_onwork_lock);
2902	while (rxr->rxr_onlist || rxr->rxr_onproc)
2903		cv_wait(&rxr->rxr_onwork_cv, &rxr->rxr_onwork_lock);
2904	mutex_exit(&rxr->rxr_onwork_lock);
2905}
2906
2907static void
2908hvn_disable_rx(struct hvn_softc *sc)
2909{
2910
2911	/*
2912	 * Disable RX by clearing RX filter forcefully.
2913	 */
2914	(void)hvn_rndis_close(sc);	/* ignore error */
2915
2916	/*
2917	 * Give RNDIS enough time to flush all pending data packets.
2918	 */
2919	DELAY(200);
2920}
2921
2922static void
2923hvn_drain_rxtx(struct hvn_softc *sc, int nchan)
2924{
2925	struct vmbus_channel **subchans = NULL;
2926	int i, nsubch;
2927
2928	/*
2929	 * Drain RX/TX bufrings and interrupts.
2930	 */
2931	nsubch = nchan - 1;
2932	if (nsubch > 0)
2933		subchans = vmbus_subchannel_get(sc->sc_prichan, nsubch);
2934
2935	if (subchans != NULL) {
2936		for (i = 0; i < nsubch; ++i)
2937			hvn_channel_drain(sc, subchans[i]);
2938	}
2939	hvn_channel_drain(sc, sc->sc_prichan);
2940
2941	if (subchans != NULL)
2942		vmbus_subchannel_rel(subchans, nsubch);
2943}
2944
2945static void
2946hvn_suspend_data(struct hvn_softc *sc)
2947{
2948	struct hvn_tx_ring *txr;
2949	int i, s;
2950
2951	/*
2952	 * Suspend TX.
2953	 */
2954	for (i = 0; i < sc->sc_ntxr_inuse; i++) {
2955		txr = &sc->sc_txr[i];
2956
2957		mutex_enter(&txr->txr_lock);
2958		txr->txr_suspended = 1;
2959		mutex_exit(&txr->txr_lock);
2960		/* No one is able send more packets now. */
2961
2962		/*
2963		 * Wait for all pending sends to finish.
2964		 *
2965		 * NOTE:
2966		 * We will _not_ receive all pending send-done, if the
2967		 * primary channel is revoked.
2968		 */
2969		while (hvn_tx_ring_pending(txr) &&
2970		    !vmbus_channel_is_revoked(sc->sc_prichan)) {
2971			DELAY(20);
2972			s = splnet();
2973			hvn_nvs_intr1(txr->txr_rxr, sc->sc_tx_process_limit,
2974			    sc->sc_rx_process_limit);
2975			splx(s);
2976		}
2977	}
2978
2979	/*
2980	 * Disable RX.
2981	 */
2982	hvn_disable_rx(sc);
2983
2984	/*
2985	 * Drain RX/TX.
2986	 */
2987	hvn_drain_rxtx(sc, sc->sc_nrxr_inuse);
2988}
2989
2990static void
2991hvn_suspend_mgmt(struct hvn_softc *sc)
2992{
2993
2994	sc->sc_link_suspend = true;
2995	callout_halt(&sc->sc_link_tmout, NULL);
2996
2997	/* Drain link state task */
2998	mutex_enter(&sc->sc_link_lock);
2999	for (;;) {
3000		if (!sc->sc_link_onproc)
3001			break;
3002		mutex_exit(&sc->sc_link_lock);
3003		DELAY(20);
3004		mutex_enter(&sc->sc_link_lock);
3005	}
3006	mutex_exit(&sc->sc_link_lock);
3007}
3008
3009static void
3010hvn_suspend(struct hvn_softc *sc)
3011{
3012	struct ifnet *ifp = SC2IFP(sc);
3013
3014	if (ifp->if_flags & IFF_RUNNING)
3015		hvn_suspend_data(sc);
3016	hvn_suspend_mgmt(sc);
3017}
3018
3019static void
3020hvn_resume_tx(struct hvn_softc *sc, int ring_cnt)
3021{
3022	struct hvn_tx_ring *txr;
3023	int i;
3024
3025	for (i = 0; i < ring_cnt; i++) {
3026		txr = &sc->sc_txr[i];
3027		mutex_enter(&txr->txr_lock);
3028		txr->txr_suspended = 0;
3029		mutex_exit(&txr->txr_lock);
3030	}
3031}
3032
3033static void
3034hvn_resume_data(struct hvn_softc *sc)
3035{
3036	struct ifnet *ifp = SC2IFP(sc);
3037	struct hvn_tx_ring *txr;
3038	int i;
3039
3040	/*
3041	 * Re-enable RX.
3042	 */
3043	hvn_rndis_open(sc);
3044
3045	/*
3046	 * Make sure to clear suspend status on "all" TX rings,
3047	 * since sc_ntxr_inuse can be changed after hvn_suspend_data().
3048	 */
3049	hvn_resume_tx(sc, sc->sc_ntxr);
3050
3051	/*
3052	 * Flush unused mbuf, since sc_ntxr_inuse may be reduced.
3053	 */
3054	for (i = sc->sc_ntxr_inuse; i < sc->sc_ntxr; i++)
3055		hvn_tx_ring_qflush(sc, &sc->sc_txr[i]);
3056
3057	/*
3058	 * Kick start TX.
3059	 */
3060	for (i = 0; i < sc->sc_ntxr_inuse; i++) {
3061		txr = &sc->sc_txr[i];
3062		mutex_enter(&txr->txr_lock);
3063		txr->txr_oactive = 0;
3064
3065		/* ALTQ */
3066		if (txr->txr_id == 0)
3067			if_schedule_deferred_start(ifp);
3068		softint_schedule(txr->txr_si);
3069		mutex_exit(&txr->txr_lock);
3070	}
3071}
3072
3073static void
3074hvn_resume_mgmt(struct hvn_softc *sc)
3075{
3076
3077	sc->sc_link_suspend = false;
3078	hvn_link_event(sc, HVN_LINK_EV_RESUME_NETWORK);
3079}
3080
3081static void
3082hvn_resume(struct hvn_softc *sc)
3083{
3084	struct ifnet *ifp = SC2IFP(sc);
3085
3086	if (ifp->if_flags & IFF_RUNNING)
3087		hvn_resume_data(sc);
3088	hvn_resume_mgmt(sc);
3089}
3090
3091static int
3092hvn_nvs_init(struct hvn_softc *sc)
3093{
3094
3095	mutex_init(&sc->sc_nvsrsp_lock, MUTEX_DEFAULT, IPL_NET);
3096	cv_init(&sc->sc_nvsrsp_cv, "nvsrspcv");
3097
3098	return 0;
3099}
3100
3101static void
3102hvn_nvs_destroy(struct hvn_softc *sc)
3103{
3104
3105	mutex_destroy(&sc->sc_nvsrsp_lock);
3106	cv_destroy(&sc->sc_nvsrsp_cv);
3107}
3108
3109static int
3110hvn_nvs_doinit(struct hvn_softc *sc, uint32_t proto)
3111{
3112	struct hvn_nvs_init cmd;
3113	struct hvn_nvs_init_resp *rsp;
3114	uint64_t tid;
3115	int error;
3116
3117	memset(&cmd, 0, sizeof(cmd));
3118	cmd.nvs_type = HVN_NVS_TYPE_INIT;
3119	cmd.nvs_ver_min = cmd.nvs_ver_max = proto;
3120
3121	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3122	mutex_enter(&sc->sc_nvsrsp_lock);
3123	error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0);
3124	if (error == 0) {
3125		rsp = (struct hvn_nvs_init_resp *)&sc->sc_nvsrsp;
3126		if (rsp->nvs_status != HVN_NVS_STATUS_OK)
3127			error = EINVAL;
3128	}
3129	mutex_exit(&sc->sc_nvsrsp_lock);
3130
3131	return error;
3132}
3133
3134static int
3135hvn_nvs_conf_ndis(struct hvn_softc *sc, int mtu)
3136{
3137	struct hvn_nvs_ndis_conf cmd;
3138	uint64_t tid;
3139	int error;
3140
3141	memset(&cmd, 0, sizeof(cmd));
3142	cmd.nvs_type = HVN_NVS_TYPE_NDIS_CONF;
3143	cmd.nvs_mtu = mtu + ETHER_HDR_LEN;
3144	cmd.nvs_caps = HVN_NVS_NDIS_CONF_VLAN;
3145
3146	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3147	mutex_enter(&sc->sc_nvsrsp_lock);
3148	/* NOTE: No response. */
3149	error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0);
3150	mutex_exit(&sc->sc_nvsrsp_lock);
3151
3152	if (error == 0)
3153		sc->sc_caps |= HVN_CAPS_MTU | HVN_CAPS_VLAN;
3154	return error;
3155}
3156
3157static int
3158hvn_nvs_init_ndis(struct hvn_softc *sc)
3159{
3160	struct hvn_nvs_ndis_init cmd;
3161	uint64_t tid;
3162	int error;
3163
3164	memset(&cmd, 0, sizeof(cmd));
3165	cmd.nvs_type = HVN_NVS_TYPE_NDIS_INIT;
3166	cmd.nvs_ndis_major = (sc->sc_ndisver & 0xffff0000) >> 16;
3167	cmd.nvs_ndis_minor = sc->sc_ndisver & 0x0000ffff;
3168
3169	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3170	mutex_enter(&sc->sc_nvsrsp_lock);
3171	/* NOTE: No response. */
3172	error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0);
3173	mutex_exit(&sc->sc_nvsrsp_lock);
3174
3175	return error;
3176}
3177
3178static int
3179hvn_nvs_attach(struct hvn_softc *sc, int mtu)
3180{
3181	static const uint32_t protos[] = {
3182		HVN_NVS_PROTO_VERSION_5,
3183		HVN_NVS_PROTO_VERSION_4,
3184		HVN_NVS_PROTO_VERSION_2,
3185		HVN_NVS_PROTO_VERSION_1
3186	};
3187	int i;
3188
3189	if (hyperv_ver_major >= 10)
3190		sc->sc_caps |= HVN_CAPS_UDPHASH;
3191
3192	/*
3193	 * Initialize NVS.
3194	 */
3195	if (sc->sc_flags & HVN_SCF_ATTACHED) {
3196		/*
3197		 * NVS version and NDIS version MUST NOT be changed.
3198		 */
3199		DPRINTF("%s: reinit NVS version %#x, NDIS version %u.%u\n",
3200		    device_xname(sc->sc_dev), sc->sc_proto,
3201		    (sc->sc_ndisver >> 16), sc->sc_ndisver & 0xffff);
3202
3203		if (hvn_nvs_doinit(sc, sc->sc_proto)) {
3204			DPRINTF("%s: failed to reinit NVSP version %#x\n",
3205			    device_xname(sc->sc_dev), sc->sc_proto);
3206			return -1;
3207		}
3208	} else {
3209		/*
3210		 * Find the supported NVS version and set NDIS version
3211		 * accordingly.
3212		 */
3213		for (i = 0; i < __arraycount(protos); i++) {
3214			if (hvn_nvs_doinit(sc, protos[i]) == 0)
3215				break;
3216		}
3217		if (i == __arraycount(protos)) {
3218			DPRINTF("%s: failed to negotiate NVSP version\n",
3219			    device_xname(sc->sc_dev));
3220			return -1;
3221		}
3222
3223		sc->sc_proto = protos[i];
3224		if (sc->sc_proto <= HVN_NVS_PROTO_VERSION_4)
3225			sc->sc_ndisver = NDIS_VERSION_6_1;
3226		else
3227			sc->sc_ndisver = NDIS_VERSION_6_30;
3228
3229		DPRINTF("%s: NVS version %#x, NDIS version %u.%u\n",
3230		    device_xname(sc->sc_dev), sc->sc_proto,
3231		    (sc->sc_ndisver >> 16), sc->sc_ndisver & 0xffff);
3232	}
3233
3234	if (sc->sc_proto >= HVN_NVS_PROTO_VERSION_5)
3235		sc->sc_caps |= HVN_CAPS_HASHVAL;
3236
3237	if (sc->sc_proto >= HVN_NVS_PROTO_VERSION_2) {
3238		/*
3239		 * Configure NDIS before initializing it.
3240		 */
3241		if (hvn_nvs_conf_ndis(sc, mtu))
3242			return -1;
3243	}
3244
3245	/*
3246	 * Initialize NDIS.
3247	 */
3248	if (hvn_nvs_init_ndis(sc))
3249		return -1;
3250
3251	/*
3252	 * Connect RXBUF.
3253	 */
3254	if (hvn_nvs_connect_rxbuf(sc))
3255		return -1;
3256
3257	/*
3258	 * Connect chimney sending buffer.
3259	 */
3260	if (hvn_nvs_connect_chim(sc))
3261		return -1;
3262
3263	return 0;
3264}
3265
3266static int
3267hvn_nvs_connect_rxbuf(struct hvn_softc *sc)
3268{
3269	struct hvn_nvs_rxbuf_conn cmd;
3270	struct hvn_nvs_rxbuf_conn_resp *rsp;
3271	uint64_t tid;
3272
3273	if (vmbus_handle_alloc(sc->sc_prichan, &sc->sc_rx_dma, sc->sc_rx_size,
3274	    &sc->sc_rx_hndl)) {
3275		DPRINTF("%s: failed to obtain a PA handle\n",
3276		    device_xname(sc->sc_dev));
3277		return -1;
3278	}
3279
3280	memset(&cmd, 0, sizeof(cmd));
3281	cmd.nvs_type = HVN_NVS_TYPE_RXBUF_CONN;
3282	cmd.nvs_gpadl = sc->sc_rx_hndl;
3283	cmd.nvs_sig = HVN_NVS_RXBUF_SIG;
3284
3285	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3286	mutex_enter(&sc->sc_nvsrsp_lock);
3287	if (hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0))
3288		goto errout;
3289
3290	rsp = (struct hvn_nvs_rxbuf_conn_resp *)&sc->sc_nvsrsp;
3291	if (rsp->nvs_status != HVN_NVS_STATUS_OK) {
3292		DPRINTF("%s: failed to set up the Rx ring\n",
3293		    device_xname(sc->sc_dev));
3294		goto errout;
3295	}
3296
3297	SET(sc->sc_flags, HVN_SCF_RXBUF_CONNECTED);
3298
3299	if (rsp->nvs_nsect > 1) {
3300		DPRINTF("%s: invalid number of Rx ring sections: %u\n",
3301		    device_xname(sc->sc_dev), rsp->nvs_nsect);
3302		goto errout;
3303	}
3304	mutex_exit(&sc->sc_nvsrsp_lock);
3305
3306	return 0;
3307
3308 errout:
3309	mutex_exit(&sc->sc_nvsrsp_lock);
3310	hvn_nvs_disconnect_rxbuf(sc);
3311	return -1;
3312}
3313
3314static int
3315hvn_nvs_disconnect_rxbuf(struct hvn_softc *sc)
3316{
3317	struct hvn_nvs_rxbuf_disconn cmd;
3318	uint64_t tid;
3319	int s, error;
3320
3321	if (ISSET(sc->sc_flags, HVN_SCF_RXBUF_CONNECTED)) {
3322		memset(&cmd, 0, sizeof(cmd));
3323		cmd.nvs_type = HVN_NVS_TYPE_RXBUF_DISCONN;
3324		cmd.nvs_sig = HVN_NVS_RXBUF_SIG;
3325
3326		tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3327		mutex_enter(&sc->sc_nvsrsp_lock);
3328		error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid,
3329		    HVN_NVS_CMD_NORESP);
3330		if (error) {
3331			device_printf(sc->sc_dev,
3332			    "failed to send rxbuf disconn: %d", error);
3333		}
3334		CLR(sc->sc_flags, HVN_SCF_RXBUF_CONNECTED);
3335		mutex_exit(&sc->sc_nvsrsp_lock);
3336
3337		/*
3338		 * Wait for the hypervisor to receive this NVS request.
3339		 *
3340		 * NOTE:
3341		 * The TX bufring will not be drained by the hypervisor,
3342		 * if the primary channel is revoked.
3343		 */
3344		while (!vmbus_channel_tx_empty(sc->sc_prichan) &&
3345		    !vmbus_channel_is_revoked(sc->sc_prichan)) {
3346			DELAY(20);
3347			s = splnet();
3348			hvn_nvs_intr1(&sc->sc_rxr[0], sc->sc_tx_process_limit,
3349			    sc->sc_rx_process_limit);
3350			splx(s);
3351		}
3352		/*
3353		 * Linger long enough for NVS to disconnect RXBUF.
3354		 */
3355		DELAY(200);
3356	}
3357
3358	if (sc->sc_prichan->ch_sc->sc_proto < VMBUS_VERSION_WIN10 &&
3359	    sc->sc_rx_hndl) {
3360		/*
3361		 * Disconnect RXBUF from primary channel.
3362		 */
3363		vmbus_handle_free(sc->sc_prichan, sc->sc_rx_hndl);
3364		sc->sc_rx_hndl = 0;
3365	}
3366
3367	return 0;
3368}
3369
3370static int
3371hvn_nvs_connect_chim(struct hvn_softc *sc)
3372{
3373	struct hvn_nvs_chim_conn cmd;
3374	const struct hvn_nvs_chim_conn_resp *rsp;
3375	uint64_t tid;
3376
3377	mutex_init(&sc->sc_chim_bmap_lock, MUTEX_DEFAULT, IPL_NET);
3378
3379	/*
3380	 * Connect chimney sending buffer GPADL to the primary channel.
3381	 *
3382	 * NOTE:
3383	 * Only primary channel has chimney sending buffer connected to it.
3384	 * Sub-channels just share this chimney sending buffer.
3385	 */
3386	if (vmbus_handle_alloc(sc->sc_prichan, &sc->sc_chim_dma, HVN_CHIM_SIZE,
3387	    &sc->sc_chim_hndl)) {
3388		DPRINTF("%s: failed to obtain a PA handle for chimney\n",
3389		    device_xname(sc->sc_dev));
3390		return -1;
3391	}
3392
3393	memset(&cmd, 0, sizeof(cmd));
3394	cmd.nvs_type = HVN_NVS_TYPE_CHIM_CONN;
3395	cmd.nvs_gpadl = sc->sc_chim_hndl;
3396	cmd.nvs_sig = HVN_NVS_CHIM_SIG;
3397
3398	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3399	mutex_enter(&sc->sc_nvsrsp_lock);
3400	if (hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0))
3401		goto errout;
3402
3403	rsp = (struct hvn_nvs_chim_conn_resp *)&sc->sc_nvsrsp;
3404	if (rsp->nvs_status != HVN_NVS_STATUS_OK) {
3405		DPRINTF("%s: failed to set up chimney sending buffer\n",
3406		    device_xname(sc->sc_dev));
3407		goto errout;
3408	}
3409
3410	if (rsp->nvs_sectsz == 0 ||
3411	    (rsp->nvs_sectsz % sizeof(uint32_t)) != 0) {
3412		/*
3413		 * Can't use chimney sending buffer; done!
3414		 */
3415		if (rsp->nvs_sectsz == 0) {
3416			device_printf(sc->sc_dev,
3417			    "zero chimney sending buffer section size\n");
3418		} else {
3419			device_printf(sc->sc_dev,
3420			    "misaligned chimney sending buffers,"
3421			    " section size: %d", rsp->nvs_sectsz);
3422		}
3423		sc->sc_chim_szmax = 0;
3424		sc->sc_chim_cnt = 0;
3425	} else {
3426		sc->sc_chim_szmax = rsp->nvs_sectsz;
3427		sc->sc_chim_cnt = HVN_CHIM_SIZE / sc->sc_chim_szmax;
3428	}
3429
3430	if (sc->sc_chim_szmax > 0) {
3431		if ((HVN_CHIM_SIZE % sc->sc_chim_szmax) != 0) {
3432			device_printf(sc->sc_dev,
3433			    "chimney sending sections are not properly "
3434			    "aligned\n");
3435		}
3436		if ((sc->sc_chim_cnt % LONG_BIT) != 0) {
3437			device_printf(sc->sc_dev,
3438			    "discard %d chimney sending sections\n",
3439			    sc->sc_chim_cnt % LONG_BIT);
3440		}
3441
3442		sc->sc_chim_bmap_cnt = sc->sc_chim_cnt / LONG_BIT;
3443		sc->sc_chim_bmap = kmem_zalloc(sc->sc_chim_bmap_cnt *
3444		    sizeof(u_long), KM_SLEEP);
3445	}
3446
3447	/* Done! */
3448	SET(sc->sc_flags, HVN_SCF_CHIM_CONNECTED);
3449
3450	aprint_verbose_dev(sc->sc_dev, "chimney sending buffer %d/%d\n",
3451	    sc->sc_chim_szmax, sc->sc_chim_cnt);
3452
3453	mutex_exit(&sc->sc_nvsrsp_lock);
3454
3455	return 0;
3456
3457errout:
3458	mutex_exit(&sc->sc_nvsrsp_lock);
3459	hvn_nvs_disconnect_chim(sc);
3460	return -1;
3461}
3462
3463static int
3464hvn_nvs_disconnect_chim(struct hvn_softc *sc)
3465{
3466	struct hvn_nvs_chim_disconn cmd;
3467	uint64_t tid;
3468	int s, error;
3469
3470	if (ISSET(sc->sc_flags, HVN_SCF_CHIM_CONNECTED)) {
3471		memset(&cmd, 0, sizeof(cmd));
3472		cmd.nvs_type = HVN_NVS_TYPE_CHIM_DISCONN;
3473		cmd.nvs_sig = HVN_NVS_CHIM_SIG;
3474
3475		tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3476		mutex_enter(&sc->sc_nvsrsp_lock);
3477		error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid,
3478		    HVN_NVS_CMD_NORESP);
3479		if (error) {
3480			device_printf(sc->sc_dev,
3481			    "failed to send chim disconn: %d", error);
3482		}
3483		CLR(sc->sc_flags, HVN_SCF_CHIM_CONNECTED);
3484		mutex_exit(&sc->sc_nvsrsp_lock);
3485
3486		/*
3487		 * Wait for the hypervisor to receive this NVS request.
3488		 *
3489		 * NOTE:
3490		 * The TX bufring will not be drained by the hypervisor,
3491		 * if the primary channel is revoked.
3492		 */
3493		while (!vmbus_channel_tx_empty(sc->sc_prichan) &&
3494		    !vmbus_channel_is_revoked(sc->sc_prichan)) {
3495			DELAY(20);
3496			s = splnet();
3497			hvn_nvs_intr1(&sc->sc_rxr[0], sc->sc_tx_process_limit,
3498			    sc->sc_rx_process_limit);
3499			splx(s);
3500		}
3501		/*
3502		 * Linger long enough for NVS to disconnect chimney
3503		 * sending buffer.
3504		 */
3505		DELAY(200);
3506	}
3507
3508	if (sc->sc_prichan->ch_sc->sc_proto < VMBUS_VERSION_WIN10 &&
3509	    sc->sc_chim_hndl) {
3510		/*
3511		 * Disconnect chimney sending buffer from primary channel.
3512		 */
3513		vmbus_handle_free(sc->sc_prichan, sc->sc_chim_hndl);
3514		sc->sc_chim_hndl = 0;
3515	}
3516
3517	if (sc->sc_chim_bmap != NULL) {
3518		kmem_free(sc->sc_chim_bmap, sc->sc_chim_cnt / LONG_BIT);
3519		sc->sc_chim_bmap = NULL;
3520		sc->sc_chim_bmap_cnt = 0;
3521	}
3522
3523	mutex_destroy(&sc->sc_chim_bmap_lock);
3524
3525	return 0;
3526}
3527
3528#define HVN_HANDLE_RING_DOTX	__BIT(0)
3529
3530static int
3531hvn_handle_ring(struct hvn_rx_ring *rxr, int txlimit, int rxlimit)
3532{
3533	struct hvn_softc *sc = rxr->rxr_softc;
3534	struct vmbus_chanpkt_hdr *cph;
3535	const struct hvn_nvs_hdr *nvs;
3536	uint64_t rid;
3537	uint32_t rlen;
3538	int n, tx = 0, rx = 0;
3539	int result = 0;
3540	int rv;
3541
3542	mutex_enter(&rxr->rxr_lock);
3543	for (;;) {
3544		rv = vmbus_channel_recv(rxr->rxr_chan, rxr->rxr_nvsbuf,
3545		    HVN_NVS_BUFSIZE, &rlen, &rid, 1);
3546		if (rv != 0 || rlen == 0) {
3547			if (rv != EAGAIN)
3548				device_printf(sc->sc_dev,
3549				    "failed to receive an NVSP packet\n");
3550			break;
3551		}
3552		cph = (struct vmbus_chanpkt_hdr *)rxr->rxr_nvsbuf;
3553		nvs = (const struct hvn_nvs_hdr *)VMBUS_CHANPKT_CONST_DATA(cph);
3554
3555		if (cph->cph_type == VMBUS_CHANPKT_TYPE_COMP) {
3556			switch (nvs->nvs_type) {
3557			case HVN_NVS_TYPE_INIT_RESP:
3558			case HVN_NVS_TYPE_RXBUF_CONNRESP:
3559			case HVN_NVS_TYPE_CHIM_CONNRESP:
3560			case HVN_NVS_TYPE_SUBCH_RESP:
3561				mutex_enter(&sc->sc_nvsrsp_lock);
3562				/* copy the response back */
3563				memcpy(&sc->sc_nvsrsp, nvs, HVN_NVS_MSGSIZE);
3564				sc->sc_nvsdone = 1;
3565				cv_signal(&sc->sc_nvsrsp_cv);
3566				mutex_exit(&sc->sc_nvsrsp_lock);
3567				break;
3568			case HVN_NVS_TYPE_RNDIS_ACK:
3569				if (rxr->rxr_txr == NULL)
3570					break;
3571
3572				result |= HVN_HANDLE_RING_DOTX;
3573				mutex_enter(&rxr->rxr_txr->txr_lock);
3574				hvn_txeof(rxr->rxr_txr, cph->cph_tid);
3575				mutex_exit(&rxr->rxr_txr->txr_lock);
3576				if (txlimit > 0 && ++tx >= txlimit)
3577					goto out;
3578				break;
3579			default:
3580				device_printf(sc->sc_dev,
3581				    "unhandled NVSP packet type %u "
3582				    "on completion\n", nvs->nvs_type);
3583				break;
3584			}
3585		} else if (cph->cph_type == VMBUS_CHANPKT_TYPE_RXBUF) {
3586			switch (nvs->nvs_type) {
3587			case HVN_NVS_TYPE_RNDIS:
3588				n = hvn_rndis_input(rxr, cph->cph_tid, cph);
3589				if (rxlimit > 0) {
3590					if (n < 0)
3591						goto out;
3592					rx += n;
3593					if (rx >= rxlimit)
3594						goto out;
3595				}
3596				break;
3597			default:
3598				device_printf(sc->sc_dev,
3599				    "unhandled NVSP packet type %u "
3600				    "on receive\n", nvs->nvs_type);
3601				break;
3602			}
3603		} else if (cph->cph_type == VMBUS_CHANPKT_TYPE_INBAND) {
3604			switch (nvs->nvs_type) {
3605			case HVN_NVS_TYPE_TXTBL_NOTE:
3606				/* Useless; ignore */
3607				break;
3608			default:
3609				device_printf(sc->sc_dev,
3610				    "got notify, nvs type %u\n", nvs->nvs_type);
3611				break;
3612			}
3613		} else
3614			device_printf(sc->sc_dev,
3615			    "unknown NVSP packet type %u\n", cph->cph_type);
3616	}
3617out:
3618	mutex_exit(&rxr->rxr_lock);
3619
3620	return result;
3621}
3622
3623static void
3624hvn_nvs_intr1(struct hvn_rx_ring *rxr, int txlimit, int rxlimit)
3625{
3626	struct hvn_softc *sc = rxr->rxr_softc;
3627	struct ifnet *ifp = SC2IFP(sc);
3628	struct hvn_tx_ring *txr = rxr->rxr_txr;
3629	int result;
3630
3631	rxr->rxr_workqueue = sc->sc_txrx_workqueue;
3632
3633	result = hvn_handle_ring(rxr, txlimit, rxlimit);
3634
3635	if ((result & HVN_HANDLE_RING_DOTX) && txr != NULL) {
3636		mutex_enter(&txr->txr_lock);
3637		/* ALTQ */
3638		if (txr->txr_id == 0) {
3639			if_schedule_deferred_start(ifp);
3640		}
3641		softint_schedule(txr->txr_si);
3642		mutex_exit(&txr->txr_lock);
3643	}
3644}
3645
3646static void
3647hvn_schedule_handle_ring(struct hvn_softc *sc, struct hvn_rx_ring *rxr,
3648    bool intr)
3649{
3650
3651	KASSERT(mutex_owned(&rxr->rxr_onwork_lock));
3652
3653	if (rxr->rxr_workqueue) {
3654		if (!rxr->rxr_onlist) {
3655			rxr->rxr_onlist = true;
3656			if (intr)
3657				rxr->rxr_evdeferreq.ev_count++;
3658			else
3659				rxr->rxr_evredeferreq.ev_count++;
3660			workqueue_enqueue(sc->sc_wq, &rxr->rxr_wk, NULL);
3661		}
3662	} else {
3663		rxr->rxr_onlist = true;
3664		if (intr)
3665			rxr->rxr_evdeferreq.ev_count++;
3666		else
3667			rxr->rxr_evredeferreq.ev_count++;
3668		softint_schedule(rxr->rxr_si);
3669	}
3670}
3671
3672static void
3673hvn_handle_ring_common(struct hvn_rx_ring *rxr)
3674{
3675	struct hvn_softc *sc = rxr->rxr_softc;
3676	int txlimit = sc->sc_tx_process_limit;
3677	int rxlimit = sc->sc_rx_process_limit;
3678
3679	rxr->rxr_evdefer.ev_count++;
3680
3681	mutex_enter(&rxr->rxr_onwork_lock);
3682	rxr->rxr_onproc = true;
3683	rxr->rxr_onlist = false;
3684	mutex_exit(&rxr->rxr_onwork_lock);
3685
3686	hvn_nvs_intr1(rxr, txlimit, rxlimit);
3687
3688	mutex_enter(&rxr->rxr_onwork_lock);
3689	if (vmbus_channel_unpause(rxr->rxr_chan)) {
3690		vmbus_channel_pause(rxr->rxr_chan);
3691		hvn_schedule_handle_ring(sc, rxr, false);
3692	}
3693	rxr->rxr_onproc = false;
3694	cv_broadcast(&rxr->rxr_onwork_cv);
3695	mutex_exit(&rxr->rxr_onwork_lock);
3696}
3697
3698static void
3699hvn_handle_ring_work(struct work *wk, void *arg)
3700{
3701	struct hvn_rx_ring *rxr = container_of(wk, struct hvn_rx_ring, rxr_wk);
3702
3703	hvn_handle_ring_common(rxr);
3704}
3705
3706static void
3707hvn_nvs_softintr(void *arg)
3708{
3709	struct hvn_rx_ring *rxr = arg;
3710
3711	hvn_handle_ring_common(rxr);
3712}
3713
3714static void
3715hvn_nvs_intr(void *arg)
3716{
3717	struct hvn_rx_ring *rxr = arg;
3718	struct hvn_softc *sc = rxr->rxr_softc;
3719	int txlimit = cold ? 0 : sc->sc_tx_intr_process_limit;
3720	int rxlimit = cold ? 0 : sc->sc_rx_intr_process_limit;
3721
3722	rxr->rxr_evintr.ev_count++;
3723
3724	KASSERT(!rxr->rxr_onproc);
3725	KASSERT(!rxr->rxr_onlist);
3726
3727	vmbus_channel_pause(rxr->rxr_chan);
3728
3729	hvn_nvs_intr1(rxr, txlimit, rxlimit);
3730
3731	if (vmbus_channel_unpause(rxr->rxr_chan) && !cold) {
3732		vmbus_channel_pause(rxr->rxr_chan);
3733		mutex_enter(&rxr->rxr_onwork_lock);
3734		hvn_schedule_handle_ring(sc, rxr, true);
3735		mutex_exit(&rxr->rxr_onwork_lock);
3736	}
3737}
3738
3739static int
3740hvn_nvs_cmd(struct hvn_softc *sc, void *cmd, size_t cmdsize, uint64_t tid,
3741    u_int flags)
3742{
3743	struct hvn_rx_ring *rxr = &sc->sc_rxr[0];	/* primary channel */
3744	struct hvn_nvs_hdr *hdr = cmd;
3745	int tries = 10;
3746	int rv, s;
3747
3748	KASSERT(mutex_owned(&sc->sc_nvsrsp_lock));
3749
3750	sc->sc_nvsdone = 0;
3751
3752	do {
3753		rv = vmbus_channel_send(rxr->rxr_chan, cmd, cmdsize,
3754		    tid, VMBUS_CHANPKT_TYPE_INBAND,
3755		    ISSET(flags, HVN_NVS_CMD_NORESP) ? 0 :
3756		      VMBUS_CHANPKT_FLAG_RC);
3757		if (rv == EAGAIN) {
3758			DELAY(1000);
3759		} else if (rv) {
3760			DPRINTF("%s: NVSP operation %u send error %d\n",
3761			    device_xname(sc->sc_dev), hdr->nvs_type, rv);
3762			return rv;
3763		}
3764	} while (rv != 0 && --tries > 0);
3765
3766	if (tries == 0 && rv != 0) {
3767		device_printf(sc->sc_dev,
3768		    "NVSP operation %u send error %d\n", hdr->nvs_type, rv);
3769		return rv;
3770	}
3771
3772	if (ISSET(flags, HVN_NVS_CMD_NORESP))
3773		return 0;
3774
3775	while (!sc->sc_nvsdone && !ISSET(sc->sc_flags, HVN_SCF_REVOKED)) {
3776		mutex_exit(&sc->sc_nvsrsp_lock);
3777		DELAY(1000);
3778		s = splnet();
3779		hvn_nvs_intr1(rxr, 0, 0);
3780		splx(s);
3781		mutex_enter(&sc->sc_nvsrsp_lock);
3782	}
3783
3784	return 0;
3785}
3786
3787static int
3788hvn_nvs_ack(struct hvn_rx_ring *rxr, uint64_t tid)
3789{
3790	struct hvn_softc *sc __unused = rxr->rxr_softc;
3791	struct hvn_nvs_rndis_ack cmd;
3792	int tries = 5;
3793	int rv;
3794
3795	cmd.nvs_type = HVN_NVS_TYPE_RNDIS_ACK;
3796	cmd.nvs_status = HVN_NVS_STATUS_OK;
3797	do {
3798		rv = vmbus_channel_send(rxr->rxr_chan, &cmd, sizeof(cmd),
3799		    tid, VMBUS_CHANPKT_TYPE_COMP, 0);
3800		if (rv == EAGAIN)
3801			DELAY(10);
3802		else if (rv) {
3803			DPRINTF("%s: NVSP acknowledgement error %d\n",
3804			    device_xname(sc->sc_dev), rv);
3805			return rv;
3806		}
3807	} while (rv != 0 && --tries > 0);
3808	return rv;
3809}
3810
3811static void
3812hvn_nvs_detach(struct hvn_softc *sc)
3813{
3814
3815	hvn_nvs_disconnect_rxbuf(sc);
3816	hvn_nvs_disconnect_chim(sc);
3817}
3818
3819static int
3820hvn_nvs_alloc_subchannels(struct hvn_softc *sc, int *nsubchp)
3821{
3822	struct hvn_nvs_subch_req cmd;
3823	struct hvn_nvs_subch_resp *rsp;
3824	uint64_t tid;
3825	int nsubch, nsubch_req;
3826
3827	nsubch_req = *nsubchp;
3828	KASSERTMSG(nsubch_req > 0, "invalid # of sub-channels %d", nsubch_req);
3829
3830	memset(&cmd, 0, sizeof(cmd));
3831	cmd.nvs_type = HVN_NVS_TYPE_SUBCH_REQ;
3832	cmd.nvs_op = HVN_NVS_SUBCH_OP_ALLOC;
3833	cmd.nvs_nsubch = nsubch_req;
3834
3835	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3836	mutex_enter(&sc->sc_nvsrsp_lock);
3837	if (hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0)) {
3838		mutex_exit(&sc->sc_nvsrsp_lock);
3839		return EIO;
3840	}
3841
3842	rsp = (struct hvn_nvs_subch_resp *)&sc->sc_nvsrsp;
3843	if (rsp->nvs_status != HVN_NVS_STATUS_OK) {
3844		mutex_exit(&sc->sc_nvsrsp_lock);
3845		DPRINTF("%s: failed to alloc sub-channels\n",
3846		    device_xname(sc->sc_dev));
3847		return EIO;
3848	}
3849
3850	nsubch = rsp->nvs_nsubch;
3851	if (nsubch > nsubch_req) {
3852		aprint_debug_dev(sc->sc_dev,
3853		    "%u subchans are allocated, requested %d\n",
3854		    nsubch, nsubch_req);
3855		nsubch = nsubch_req;
3856	}
3857	mutex_exit(&sc->sc_nvsrsp_lock);
3858
3859	*nsubchp = nsubch;
3860
3861	return 0;
3862}
3863
3864static inline struct rndis_cmd *
3865hvn_alloc_cmd(struct hvn_softc *sc)
3866{
3867	struct rndis_cmd *rc;
3868
3869	mutex_enter(&sc->sc_cntl_fqlck);
3870	while ((rc = TAILQ_FIRST(&sc->sc_cntl_fq)) == NULL)
3871		cv_wait(&sc->sc_cntl_fqcv, &sc->sc_cntl_fqlck);
3872	TAILQ_REMOVE(&sc->sc_cntl_fq, rc, rc_entry);
3873	mutex_exit(&sc->sc_cntl_fqlck);
3874	return rc;
3875}
3876
3877static inline void
3878hvn_submit_cmd(struct hvn_softc *sc, struct rndis_cmd *rc)
3879{
3880
3881	mutex_enter(&sc->sc_cntl_sqlck);
3882	TAILQ_INSERT_TAIL(&sc->sc_cntl_sq, rc, rc_entry);
3883	mutex_exit(&sc->sc_cntl_sqlck);
3884}
3885
3886static inline struct rndis_cmd *
3887hvn_complete_cmd(struct hvn_softc *sc, uint32_t id)
3888{
3889	struct rndis_cmd *rc;
3890
3891	mutex_enter(&sc->sc_cntl_sqlck);
3892	TAILQ_FOREACH(rc, &sc->sc_cntl_sq, rc_entry) {
3893		if (rc->rc_id == id) {
3894			TAILQ_REMOVE(&sc->sc_cntl_sq, rc, rc_entry);
3895			break;
3896		}
3897	}
3898	mutex_exit(&sc->sc_cntl_sqlck);
3899	if (rc != NULL) {
3900		mutex_enter(&sc->sc_cntl_cqlck);
3901		TAILQ_INSERT_TAIL(&sc->sc_cntl_cq, rc, rc_entry);
3902		mutex_exit(&sc->sc_cntl_cqlck);
3903	}
3904	return rc;
3905}
3906
3907static inline void
3908hvn_release_cmd(struct hvn_softc *sc, struct rndis_cmd *rc)
3909{
3910
3911	mutex_enter(&sc->sc_cntl_cqlck);
3912	TAILQ_REMOVE(&sc->sc_cntl_cq, rc, rc_entry);
3913	mutex_exit(&sc->sc_cntl_cqlck);
3914}
3915
3916static inline int
3917hvn_rollback_cmd(struct hvn_softc *sc, struct rndis_cmd *rc)
3918{
3919	struct rndis_cmd *rn;
3920
3921	mutex_enter(&sc->sc_cntl_sqlck);
3922	TAILQ_FOREACH(rn, &sc->sc_cntl_sq, rc_entry) {
3923		if (rn == rc) {
3924			TAILQ_REMOVE(&sc->sc_cntl_sq, rc, rc_entry);
3925			mutex_exit(&sc->sc_cntl_sqlck);
3926			return 0;
3927		}
3928	}
3929	mutex_exit(&sc->sc_cntl_sqlck);
3930	return -1;
3931}
3932
3933static inline void
3934hvn_free_cmd(struct hvn_softc *sc, struct rndis_cmd *rc)
3935{
3936
3937	memset(rc->rc_req, 0, sizeof(struct rndis_packet_msg));
3938	memset(&rc->rc_cmp, 0, sizeof(rc->rc_cmp));
3939	memset(&rc->rc_msg, 0, sizeof(rc->rc_msg));
3940	mutex_enter(&sc->sc_cntl_fqlck);
3941	TAILQ_INSERT_TAIL(&sc->sc_cntl_fq, rc, rc_entry);
3942	cv_signal(&sc->sc_cntl_fqcv);
3943	mutex_exit(&sc->sc_cntl_fqlck);
3944}
3945
3946static int
3947hvn_rndis_init(struct hvn_softc *sc)
3948{
3949	struct rndis_cmd *rc;
3950	int i;
3951
3952	/* RNDIS control message queues */
3953	TAILQ_INIT(&sc->sc_cntl_sq);
3954	TAILQ_INIT(&sc->sc_cntl_cq);
3955	TAILQ_INIT(&sc->sc_cntl_fq);
3956	mutex_init(&sc->sc_cntl_sqlck, MUTEX_DEFAULT, IPL_NET);
3957	mutex_init(&sc->sc_cntl_cqlck, MUTEX_DEFAULT, IPL_NET);
3958	mutex_init(&sc->sc_cntl_fqlck, MUTEX_DEFAULT, IPL_NET);
3959	cv_init(&sc->sc_cntl_fqcv, "nvsalloc");
3960
3961	for (i = 0; i < HVN_RNDIS_CTLREQS; i++) {
3962		rc = &sc->sc_cntl_msgs[i];
3963		if (bus_dmamap_create(sc->sc_dmat, PAGE_SIZE, 1, PAGE_SIZE, 0,
3964		    BUS_DMA_WAITOK, &rc->rc_dmap)) {
3965			DPRINTF("%s: failed to create RNDIS command map\n",
3966			    device_xname(sc->sc_dev));
3967			goto errout;
3968		}
3969		if (bus_dmamem_alloc(sc->sc_dmat, PAGE_SIZE, PAGE_SIZE,
3970		    0, &rc->rc_segs, 1, &rc->rc_nsegs, BUS_DMA_WAITOK)) {
3971			DPRINTF("%s: failed to allocate RNDIS command\n",
3972			    device_xname(sc->sc_dev));
3973			bus_dmamap_destroy(sc->sc_dmat, rc->rc_dmap);
3974			goto errout;
3975		}
3976		if (bus_dmamem_map(sc->sc_dmat, &rc->rc_segs, rc->rc_nsegs,
3977		    PAGE_SIZE, (void **)&rc->rc_req, BUS_DMA_WAITOK)) {
3978			DPRINTF("%s: failed to allocate RNDIS command\n",
3979			    device_xname(sc->sc_dev));
3980			bus_dmamem_free(sc->sc_dmat, &rc->rc_segs,
3981			    rc->rc_nsegs);
3982			bus_dmamap_destroy(sc->sc_dmat, rc->rc_dmap);
3983			goto errout;
3984		}
3985		memset(rc->rc_req, 0, PAGE_SIZE);
3986		if (bus_dmamap_load(sc->sc_dmat, rc->rc_dmap, rc->rc_req,
3987		    PAGE_SIZE, NULL, BUS_DMA_WAITOK)) {
3988			DPRINTF("%s: failed to load RNDIS command map\n",
3989			    device_xname(sc->sc_dev));
3990			bus_dmamem_unmap(sc->sc_dmat, rc->rc_req, PAGE_SIZE);
3991			rc->rc_req = NULL;
3992			bus_dmamem_free(sc->sc_dmat, &rc->rc_segs,
3993			    rc->rc_nsegs);
3994			bus_dmamap_destroy(sc->sc_dmat, rc->rc_dmap);
3995			goto errout;
3996		}
3997		rc->rc_gpa = atop(rc->rc_dmap->dm_segs[0].ds_addr);
3998		mutex_init(&rc->rc_lock, MUTEX_DEFAULT, IPL_NET);
3999		cv_init(&rc->rc_cv, "rndiscmd");
4000		TAILQ_INSERT_TAIL(&sc->sc_cntl_fq, rc, rc_entry);
4001	}
4002
4003	/* Initialize RNDIS Data command */
4004	memset(&sc->sc_data_msg, 0, sizeof(sc->sc_data_msg));
4005	sc->sc_data_msg.nvs_type = HVN_NVS_TYPE_RNDIS;
4006	sc->sc_data_msg.nvs_rndis_mtype = HVN_NVS_RNDIS_MTYPE_DATA;
4007	sc->sc_data_msg.nvs_chim_idx = HVN_NVS_CHIM_IDX_INVALID;
4008
4009	return 0;
4010
4011errout:
4012	hvn_rndis_destroy(sc);
4013	return -1;
4014}
4015
4016static void
4017hvn_rndis_destroy(struct hvn_softc *sc)
4018{
4019	struct rndis_cmd *rc;
4020	int i;
4021
4022	for (i = 0; i < HVN_RNDIS_CTLREQS; i++) {
4023		rc = &sc->sc_cntl_msgs[i];
4024		if (rc->rc_req == NULL)
4025			continue;
4026
4027		TAILQ_REMOVE(&sc->sc_cntl_fq, rc, rc_entry);
4028		bus_dmamap_unload(sc->sc_dmat, rc->rc_dmap);
4029		bus_dmamem_unmap(sc->sc_dmat, rc->rc_req, PAGE_SIZE);
4030		rc->rc_req = NULL;
4031		bus_dmamem_free(sc->sc_dmat, &rc->rc_segs, rc->rc_nsegs);
4032		bus_dmamap_destroy(sc->sc_dmat, rc->rc_dmap);
4033		mutex_destroy(&rc->rc_lock);
4034		cv_destroy(&rc->rc_cv);
4035	}
4036
4037	mutex_destroy(&sc->sc_cntl_sqlck);
4038	mutex_destroy(&sc->sc_cntl_cqlck);
4039	mutex_destroy(&sc->sc_cntl_fqlck);
4040	cv_destroy(&sc->sc_cntl_fqcv);
4041}
4042
4043static int
4044hvn_rndis_attach(struct hvn_softc *sc, int mtu)
4045{
4046	struct rndis_init_req *req;
4047	struct rndis_init_comp *cmp;
4048	struct rndis_cmd *rc;
4049	int rv;
4050
4051	rc = hvn_alloc_cmd(sc);
4052
4053	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4054	    BUS_DMASYNC_PREREAD);
4055
4056	rc->rc_id = atomic_inc_uint_nv(&sc->sc_rndisrid);
4057
4058	req = rc->rc_req;
4059	req->rm_type = REMOTE_NDIS_INITIALIZE_MSG;
4060	req->rm_len = sizeof(*req);
4061	req->rm_rid = rc->rc_id;
4062	req->rm_ver_major = RNDIS_VERSION_MAJOR;
4063	req->rm_ver_minor = RNDIS_VERSION_MINOR;
4064	req->rm_max_xfersz = HVN_RNDIS_XFER_SIZE;
4065
4066	rc->rc_cmplen = sizeof(*cmp);
4067
4068	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4069	    BUS_DMASYNC_PREWRITE);
4070
4071	if ((rv = hvn_rndis_cmd(sc, rc, 0)) != 0) {
4072		DPRINTF("%s: INITIALIZE_MSG failed, error %d\n",
4073		    device_xname(sc->sc_dev), rv);
4074		hvn_free_cmd(sc, rc);
4075		return -1;
4076	}
4077	cmp = (struct rndis_init_comp *)&rc->rc_cmp;
4078	if (cmp->rm_status != RNDIS_STATUS_SUCCESS) {
4079		DPRINTF("%s: failed to init RNDIS, error %#x\n",
4080		    device_xname(sc->sc_dev), cmp->rm_status);
4081		hvn_free_cmd(sc, rc);
4082		return -1;
4083	}
4084
4085	sc->sc_rndis_agg_size = cmp->rm_pktmaxsz;
4086	sc->sc_rndis_agg_pkts = cmp->rm_pktmaxcnt;
4087	sc->sc_rndis_agg_align = __BIT(cmp->rm_align);
4088
4089	if (sc->sc_rndis_agg_align < sizeof(uint32_t)) {
4090		/*
4091		 * The RNDIS packet message encap assumes that the RNDIS
4092		 * packet message is at least 4 bytes aligned.  Fix up the
4093		 * alignment here, if the remote side sets the alignment
4094		 * too low.
4095		 */
4096		aprint_verbose_dev(sc->sc_dev,
4097		    "fixup RNDIS aggpkt align: %u -> %zu\n",
4098		    sc->sc_rndis_agg_align, sizeof(uint32_t));
4099		sc->sc_rndis_agg_align = sizeof(uint32_t);
4100	}
4101
4102	aprint_verbose_dev(sc->sc_dev,
4103	    "RNDIS ver %u.%u, aggpkt size %u, aggpkt cnt %u, aggpkt align %u\n",
4104	    cmp->rm_ver_major, cmp->rm_ver_minor, sc->sc_rndis_agg_size,
4105	    sc->sc_rndis_agg_pkts, sc->sc_rndis_agg_align);
4106
4107	hvn_free_cmd(sc, rc);
4108
4109	return 0;
4110}
4111
4112static int
4113hvn_get_rsscaps(struct hvn_softc *sc, int *nrxr)
4114{
4115	struct ndis_rss_caps in, caps;
4116	size_t caps_len;
4117	int error, rxr_cnt, indsz, hash_fnidx;
4118	uint32_t hash_func = 0, hash_types = 0;
4119
4120	*nrxr = 0;
4121
4122	if (sc->sc_ndisver < NDIS_VERSION_6_20)
4123		return EOPNOTSUPP;
4124
4125	memset(&in, 0, sizeof(in));
4126	in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS;
4127	in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2;
4128	in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE;
4129
4130	caps_len = NDIS_RSS_CAPS_SIZE;
4131	error = hvn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES,
4132	    &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0);
4133	if (error)
4134		return error;
4135
4136	/*
4137	 * Preliminary verification.
4138	 */
4139	if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) {
4140		DPRINTF("%s: invalid NDIS objtype 0x%02x\n",
4141		    device_xname(sc->sc_dev), caps.ndis_hdr.ndis_type);
4142		return EINVAL;
4143	}
4144	if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) {
4145		DPRINTF("%s: invalid NDIS objrev 0x%02x\n",
4146		    device_xname(sc->sc_dev), caps.ndis_hdr.ndis_rev);
4147		return EINVAL;
4148	}
4149	if (caps.ndis_hdr.ndis_size > caps_len) {
4150		DPRINTF("%s: invalid NDIS objsize %u, data size %zu\n",
4151		    device_xname(sc->sc_dev), caps.ndis_hdr.ndis_size,
4152		    caps_len);
4153		return EINVAL;
4154	} else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) {
4155		DPRINTF("%s: invalid NDIS objsize %u\n",
4156		    device_xname(sc->sc_dev), caps.ndis_hdr.ndis_size);
4157		return EINVAL;
4158	}
4159
4160	/*
4161	 * Save information for later RSS configuration.
4162	 */
4163	if (caps.ndis_nrxr == 0) {
4164		DPRINTF("%s: 0 RX rings!?\n", device_xname(sc->sc_dev));
4165		return EINVAL;
4166	}
4167	rxr_cnt = caps.ndis_nrxr;
4168	aprint_debug_dev(sc->sc_dev, "%u Rx rings\n", rxr_cnt);
4169
4170	if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE &&
4171	    caps.ndis_hdr.ndis_rev >= NDIS_RSS_CAPS_REV_2) {
4172		if (caps.ndis_nind > NDIS_HASH_INDCNT) {
4173			DPRINTF("%s: too many RSS indirect table entries %u\n",
4174			    device_xname(sc->sc_dev), caps.ndis_nind);
4175			return EOPNOTSUPP;
4176		}
4177		if (!powerof2(caps.ndis_nind)) {
4178			DPRINTF("%s: RSS indirect table size is not power-of-2:"
4179			    " %u\n", device_xname(sc->sc_dev), caps.ndis_nind);
4180			return EOPNOTSUPP;
4181		}
4182
4183		indsz = caps.ndis_nind;
4184	} else {
4185		indsz = NDIS_HASH_INDCNT;
4186	}
4187	if (rxr_cnt > indsz) {
4188		aprint_debug_dev(sc->sc_dev,
4189		    "# of RX rings (%u) > RSS indirect table size %u\n",
4190		    rxr_cnt, indsz);
4191		rxr_cnt = indsz;
4192	}
4193
4194	/*
4195	 * NOTE:
4196	 * Toeplitz is at the lowest bit, and it is prefered; so ffs(),
4197	 * instead of fls(), is used here.
4198	 */
4199	hash_fnidx = ffs(caps.ndis_caps & NDIS_RSS_CAP_HASHFUNC_MASK);
4200	if (hash_fnidx == 0) {
4201		DPRINTF("%s: no hash functions, caps 0x%08x\n",
4202		    device_xname(sc->sc_dev), caps.ndis_caps);
4203		return EOPNOTSUPP;
4204	}
4205	hash_func = 1 << (hash_fnidx - 1);	/* ffs is 1-based */
4206
4207	if (caps.ndis_caps & NDIS_RSS_CAP_IPV4)
4208		hash_types |= NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4;
4209	if (caps.ndis_caps & NDIS_RSS_CAP_IPV6)
4210		hash_types |= NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6;
4211	if (caps.ndis_caps & NDIS_RSS_CAP_IPV6_EX)
4212		hash_types |= NDIS_HASH_IPV6_EX | NDIS_HASH_TCP_IPV6_EX;
4213	if (hash_types == 0) {
4214		DPRINTF("%s: no hash types, caps 0x%08x\n",
4215		    device_xname(sc->sc_dev), caps.ndis_caps);
4216		return EOPNOTSUPP;
4217	}
4218	aprint_debug_dev(sc->sc_dev, "RSS caps %#x\n", caps.ndis_caps);
4219
4220	sc->sc_rss_ind_size = indsz;
4221	sc->sc_rss_hcap = hash_func | hash_types;
4222	if (sc->sc_caps & HVN_CAPS_UDPHASH) {
4223		/* UDP 4-tuple hash is unconditionally enabled. */
4224		sc->sc_rss_hcap |= NDIS_HASH_UDP_IPV4_X;
4225	}
4226	*nrxr = rxr_cnt;
4227
4228	return 0;
4229}
4230
4231static int
4232hvn_set_rss(struct hvn_softc *sc, uint16_t flags)
4233{
4234	struct ndis_rssprm_toeplitz *rss = &sc->sc_rss;
4235	struct ndis_rss_params *params = &rss->rss_params;
4236	int len;
4237
4238	/*
4239	 * Only NDIS 6.20+ is supported:
4240	 * We only support 4bytes element in indirect table, which has been
4241	 * adopted since NDIS 6.20.
4242	 */
4243	if (sc->sc_ndisver < NDIS_VERSION_6_20)
4244		return 0;
4245
4246	/* XXX only one can be specified through, popcnt? */
4247	KASSERTMSG((sc->sc_rss_hash & NDIS_HASH_FUNCTION_MASK),
4248	    "no hash func %08x", sc->sc_rss_hash);
4249	KASSERTMSG((sc->sc_rss_hash & NDIS_HASH_STD),
4250	    "no standard hash types %08x", sc->sc_rss_hash);
4251	KASSERTMSG(sc->sc_rss_ind_size > 0, "no indirect table size");
4252
4253	aprint_debug_dev(sc->sc_dev, "RSS indirect table size %d, hash %#x\n",
4254	    sc->sc_rss_ind_size, sc->sc_rss_hash);
4255
4256	len = NDIS_RSSPRM_TOEPLITZ_SIZE(sc->sc_rss_ind_size);
4257
4258	memset(params, 0, sizeof(*params));
4259	params->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS;
4260	params->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2;
4261	params->ndis_hdr.ndis_size = len;
4262	params->ndis_flags = flags;
4263	params->ndis_hash =
4264	    sc->sc_rss_hash & (NDIS_HASH_FUNCTION_MASK | NDIS_HASH_STD);
4265	params->ndis_indsize = sizeof(rss->rss_ind[0]) * sc->sc_rss_ind_size;
4266	params->ndis_indoffset =
4267	    offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]);
4268	params->ndis_keysize = sizeof(rss->rss_key);
4269	params->ndis_keyoffset =
4270	    offsetof(struct ndis_rssprm_toeplitz, rss_key[0]);
4271
4272	return hvn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS, rss, len);
4273}
4274
4275static void
4276hvn_fixup_rss_ind(struct hvn_softc *sc)
4277{
4278	struct ndis_rssprm_toeplitz *rss = &sc->sc_rss;
4279	int i, nchan;
4280
4281	nchan = sc->sc_nrxr_inuse;
4282	KASSERTMSG(nchan > 1, "invalid # of channels %d", nchan);
4283
4284	/*
4285	 * Check indirect table to make sure that all channels in it
4286	 * can be used.
4287	 */
4288	for (i = 0; i < NDIS_HASH_INDCNT; i++) {
4289		if (rss->rss_ind[i] >= nchan) {
4290			DPRINTF("%s: RSS indirect table %d fixup: %u -> %d\n",
4291			    device_xname(sc->sc_dev), i, rss->rss_ind[i],
4292			    nchan - 1);
4293			rss->rss_ind[i] = nchan - 1;
4294		}
4295	}
4296}
4297
4298static int
4299hvn_get_hwcaps(struct hvn_softc *sc, struct ndis_offload *caps)
4300{
4301	struct ndis_offload in;
4302	size_t caps_len, len;
4303	int error;
4304
4305	memset(&in, 0, sizeof(in));
4306	in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD;
4307	if (sc->sc_ndisver >= NDIS_VERSION_6_30) {
4308		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3;
4309		len = in.ndis_hdr.ndis_size = NDIS_OFFLOAD_SIZE;
4310	} else if (sc->sc_ndisver >= NDIS_VERSION_6_1) {
4311		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2;
4312		len = in.ndis_hdr.ndis_size = NDIS_OFFLOAD_SIZE_6_1;
4313	} else {
4314		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1;
4315		len = in.ndis_hdr.ndis_size = NDIS_OFFLOAD_SIZE_6_0;
4316	}
4317
4318	caps_len = NDIS_OFFLOAD_SIZE;
4319	error = hvn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES,
4320	    &in, len, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0);
4321	if (error)
4322		return error;
4323
4324	/*
4325	 * Preliminary verification.
4326	 */
4327	if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) {
4328		DPRINTF("%s: invalid NDIS objtype 0x%02x\n",
4329		    device_xname(sc->sc_dev), caps->ndis_hdr.ndis_type);
4330		return EINVAL;
4331	}
4332	if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) {
4333		DPRINTF("%s: invalid NDIS objrev 0x%02x\n",
4334		    device_xname(sc->sc_dev), caps->ndis_hdr.ndis_rev);
4335		return EINVAL;
4336	}
4337	if (caps->ndis_hdr.ndis_size > caps_len) {
4338		DPRINTF("%s: invalid NDIS objsize %u, data size %zu\n",
4339		    device_xname(sc->sc_dev), caps->ndis_hdr.ndis_size,
4340		    caps_len);
4341		return EINVAL;
4342	} else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) {
4343		DPRINTF("%s: invalid NDIS objsize %u\n",
4344		    device_xname(sc->sc_dev), caps->ndis_hdr.ndis_size);
4345		return EINVAL;
4346	}
4347
4348	/*
4349	 * NOTE:
4350	 * caps->ndis_hdr.ndis_size MUST be checked before accessing
4351	 * NDIS 6.1+ specific fields.
4352	 */
4353	aprint_debug_dev(sc->sc_dev, "hwcaps rev %u\n",
4354	    caps->ndis_hdr.ndis_rev);
4355
4356	aprint_debug_dev(sc->sc_dev, "hwcaps csum: "
4357	    "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, "
4358	    "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n",
4359	    caps->ndis_csum.ndis_ip4_txcsum, caps->ndis_csum.ndis_ip4_txenc,
4360	    caps->ndis_csum.ndis_ip4_rxcsum, caps->ndis_csum.ndis_ip4_rxenc,
4361	    caps->ndis_csum.ndis_ip6_txcsum, caps->ndis_csum.ndis_ip6_txenc,
4362	    caps->ndis_csum.ndis_ip6_rxcsum, caps->ndis_csum.ndis_ip6_rxenc);
4363	aprint_debug_dev(sc->sc_dev, "hwcaps lsov2: "
4364	    "ip4 maxsz %u minsg %u encap 0x%x, "
4365	    "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n",
4366	    caps->ndis_lsov2.ndis_ip4_maxsz, caps->ndis_lsov2.ndis_ip4_minsg,
4367	    caps->ndis_lsov2.ndis_ip4_encap, caps->ndis_lsov2.ndis_ip6_maxsz,
4368	    caps->ndis_lsov2.ndis_ip6_minsg, caps->ndis_lsov2.ndis_ip6_encap,
4369	    caps->ndis_lsov2.ndis_ip6_opts);
4370
4371	return 0;
4372}
4373
4374static int
4375hvn_set_capabilities(struct hvn_softc *sc, int mtu)
4376{
4377	struct ndis_offload hwcaps;
4378	struct ndis_offload_params params;
4379	size_t len;
4380	uint32_t caps = 0;
4381	int error, tso_maxsz, tso_minsg;
4382
4383	error = hvn_get_hwcaps(sc, &hwcaps);
4384	if (error) {
4385		DPRINTF("%s: failed to query hwcaps\n",
4386		    device_xname(sc->sc_dev));
4387		return error;
4388	}
4389
4390	/* NOTE: 0 means "no change" */
4391	memset(&params, 0, sizeof(params));
4392
4393	params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT;
4394	if (sc->sc_ndisver < NDIS_VERSION_6_30) {
4395		params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2;
4396		len = params.ndis_hdr.ndis_size = NDIS_OFFLOAD_PARAMS_SIZE_6_1;
4397	} else {
4398		params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3;
4399		len = params.ndis_hdr.ndis_size = NDIS_OFFLOAD_PARAMS_SIZE;
4400	}
4401
4402	/*
4403	 * TSO4/TSO6 setup.
4404	 */
4405	tso_maxsz = IP_MAXPACKET;
4406	tso_minsg = 2;
4407	if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) {
4408		caps |= HVN_CAPS_TSO4;
4409		params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON;
4410
4411		if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz)
4412			tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz;
4413		if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg)
4414			tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg;
4415	}
4416	if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) &&
4417	    (hwcaps.ndis_lsov2.ndis_ip6_opts & HVN_NDIS_LSOV2_CAP_IP6) ==
4418	    HVN_NDIS_LSOV2_CAP_IP6) {
4419		caps |= HVN_CAPS_TSO6;
4420		params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON;
4421
4422		if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz)
4423			tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz;
4424		if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg)
4425			tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg;
4426	}
4427	sc->sc_tso_szmax = 0;
4428	sc->sc_tso_sgmin = 0;
4429	if (caps & (HVN_CAPS_TSO4 | HVN_CAPS_TSO6)) {
4430		KASSERTMSG(tso_maxsz <= IP_MAXPACKET,
4431		    "invalid NDIS TSO maxsz %d", tso_maxsz);
4432		KASSERTMSG(tso_minsg >= 2,
4433		    "invalid NDIS TSO minsg %d", tso_minsg);
4434		if (tso_maxsz < tso_minsg * mtu) {
4435			DPRINTF("%s: invalid NDIS TSO config: "
4436			    "maxsz %d, minsg %d, mtu %d; "
4437			    "disable TSO4 and TSO6\n", device_xname(sc->sc_dev),
4438			    tso_maxsz, tso_minsg, mtu);
4439			caps &= ~(HVN_CAPS_TSO4 | HVN_CAPS_TSO6);
4440			params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF;
4441			params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF;
4442		} else {
4443			sc->sc_tso_szmax = tso_maxsz;
4444			sc->sc_tso_sgmin = tso_minsg;
4445			aprint_debug_dev(sc->sc_dev,
4446			    "NDIS TSO szmax %d sgmin %d\n",
4447			    sc->sc_tso_szmax, sc->sc_tso_sgmin);
4448		}
4449	}
4450
4451	/* IPv4 checksum */
4452	if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HVN_NDIS_TXCSUM_CAP_IP4) ==
4453	    HVN_NDIS_TXCSUM_CAP_IP4) {
4454		caps |= HVN_CAPS_IPCS;
4455		params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX;
4456	}
4457	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) {
4458		if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX)
4459			params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX;
4460		else
4461			params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX;
4462	}
4463
4464	/* TCP4 checksum */
4465	if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HVN_NDIS_TXCSUM_CAP_TCP4) ==
4466	    HVN_NDIS_TXCSUM_CAP_TCP4) {
4467		caps |= HVN_CAPS_TCP4CS;
4468		params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX;
4469	}
4470	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) {
4471		if (params.ndis_tcp4csum == NDIS_OFFLOAD_PARAM_TX)
4472			params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX;
4473		else
4474			params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_RX;
4475	}
4476
4477	/* UDP4 checksum */
4478	if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) {
4479		caps |= HVN_CAPS_UDP4CS;
4480		params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX;
4481	}
4482	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) {
4483		if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX)
4484			params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX;
4485		else
4486			params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX;
4487	}
4488
4489	/* TCP6 checksum */
4490	if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HVN_NDIS_TXCSUM_CAP_TCP6) ==
4491	    HVN_NDIS_TXCSUM_CAP_TCP6) {
4492		caps |= HVN_CAPS_TCP6CS;
4493		params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX;
4494	}
4495	if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) {
4496		if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX)
4497			params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX;
4498		else
4499			params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX;
4500	}
4501
4502	/* UDP6 checksum */
4503	if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HVN_NDIS_TXCSUM_CAP_UDP6) ==
4504	    HVN_NDIS_TXCSUM_CAP_UDP6) {
4505		caps |= HVN_CAPS_UDP6CS;
4506		params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX;
4507	}
4508	if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) {
4509		if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX)
4510			params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX;
4511		else
4512			params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX;
4513	}
4514
4515	aprint_debug_dev(sc->sc_dev, "offload csum: "
4516	    "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n",
4517	    params.ndis_ip4csum, params.ndis_tcp4csum, params.ndis_udp4csum,
4518	    params.ndis_tcp6csum, params.ndis_udp6csum);
4519	aprint_debug_dev(sc->sc_dev, "offload lsov2: ip4 %u, ip6 %u\n",
4520	    params.ndis_lsov2_ip4, params.ndis_lsov2_ip6);
4521
4522	error = hvn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, &params, len);
4523	if (error) {
4524		DPRINTF("%s: offload config failed: %d\n",
4525		    device_xname(sc->sc_dev), error);
4526		return error;
4527	}
4528
4529	aprint_debug_dev(sc->sc_dev, "offload config done\n");
4530	sc->sc_caps |= caps;
4531
4532	return 0;
4533}
4534
4535static int
4536hvn_rndis_cmd(struct hvn_softc *sc, struct rndis_cmd *rc, u_int flags)
4537{
4538	struct hvn_rx_ring *rxr = &sc->sc_rxr[0];	/* primary channel */
4539	struct hvn_nvs_rndis *msg = &rc->rc_msg;
4540	struct rndis_msghdr *hdr = rc->rc_req;
4541	struct vmbus_gpa sgl[1];
4542	int tries = 10;
4543	int rv, s;
4544
4545	msg->nvs_type = HVN_NVS_TYPE_RNDIS;
4546	msg->nvs_rndis_mtype = HVN_NVS_RNDIS_MTYPE_CTRL;
4547	msg->nvs_chim_idx = HVN_NVS_CHIM_IDX_INVALID;
4548
4549	sgl[0].gpa_page = rc->rc_gpa;
4550	sgl[0].gpa_len = hdr->rm_len;
4551	sgl[0].gpa_ofs = 0;
4552
4553	rc->rc_done = 0;
4554
4555	mutex_enter(&rc->rc_lock);
4556
4557	hvn_submit_cmd(sc, rc);
4558
4559	do {
4560		rv = vmbus_channel_send_sgl(rxr->rxr_chan, sgl, 1, &rc->rc_msg,
4561		    sizeof(*msg), rc->rc_id);
4562		if (rv == EAGAIN) {
4563			DELAY(1000);
4564		} else if (rv) {
4565			mutex_exit(&rc->rc_lock);
4566			DPRINTF("%s: RNDIS operation %u send error %d\n",
4567			    device_xname(sc->sc_dev), hdr->rm_type, rv);
4568			hvn_rollback_cmd(sc, rc);
4569			return rv;
4570		}
4571	} while (rv != 0 && --tries > 0);
4572
4573	if (tries == 0 && rv != 0) {
4574		mutex_exit(&rc->rc_lock);
4575		device_printf(sc->sc_dev,
4576		    "RNDIS operation %u send error %d\n", hdr->rm_type, rv);
4577		hvn_rollback_cmd(sc, rc);
4578		return rv;
4579	}
4580	if (vmbus_channel_is_revoked(rxr->rxr_chan) ||
4581	    ISSET(flags, HVN_RNDIS_CMD_NORESP)) {
4582		/* No response */
4583		mutex_exit(&rc->rc_lock);
4584		if (hvn_rollback_cmd(sc, rc))
4585			hvn_release_cmd(sc, rc);
4586		return 0;
4587	}
4588
4589	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4590	    BUS_DMASYNC_POSTWRITE);
4591
4592	while (!rc->rc_done && !ISSET(sc->sc_flags, HVN_SCF_REVOKED)) {
4593		mutex_exit(&rc->rc_lock);
4594		DELAY(1000);
4595		s = splnet();
4596		hvn_nvs_intr1(rxr, 0, 0);
4597		splx(s);
4598		mutex_enter(&rc->rc_lock);
4599	}
4600	mutex_exit(&rc->rc_lock);
4601
4602	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4603	    BUS_DMASYNC_POSTREAD);
4604
4605	if (!rc->rc_done) {
4606		rv = EINTR;
4607		if (hvn_rollback_cmd(sc, rc)) {
4608			hvn_release_cmd(sc, rc);
4609			rv = 0;
4610		}
4611		return rv;
4612	}
4613
4614	hvn_release_cmd(sc, rc);
4615	return 0;
4616}
4617
4618static int
4619hvn_rndis_input(struct hvn_rx_ring *rxr, uint64_t tid, void *arg)
4620{
4621	struct hvn_softc *sc = rxr->rxr_softc;
4622	struct vmbus_chanpkt_prplist *cp = arg;
4623	uint32_t off, len, type;
4624	int i, rv, rx = 0;
4625	bool qfull = false;
4626
4627	if (sc->sc_rx_ring == NULL) {
4628		DPRINTF("%s: invalid rx ring\n", device_xname(sc->sc_dev));
4629		return 0;
4630	}
4631
4632	for (i = 0; i < cp->cp_range_cnt; i++) {
4633		off = cp->cp_range[i].gpa_ofs;
4634		len = cp->cp_range[i].gpa_len;
4635
4636		KASSERT(off + len <= sc->sc_rx_size);
4637		KASSERT(len >= RNDIS_HEADER_OFFSET + 4);
4638
4639		memcpy(&type, sc->sc_rx_ring + off, sizeof(type));
4640		switch (type) {
4641		/* data message */
4642		case REMOTE_NDIS_PACKET_MSG:
4643			rv = hvn_rxeof(rxr, sc->sc_rx_ring + off, len);
4644			if (rv == 1)
4645				rx++;
4646			else if (rv == -1)	/* The receive queue is full. */
4647				qfull = true;
4648			break;
4649		/* completion messages */
4650		case REMOTE_NDIS_INITIALIZE_CMPLT:
4651		case REMOTE_NDIS_QUERY_CMPLT:
4652		case REMOTE_NDIS_SET_CMPLT:
4653		case REMOTE_NDIS_RESET_CMPLT:
4654		case REMOTE_NDIS_KEEPALIVE_CMPLT:
4655			hvn_rndis_complete(sc, sc->sc_rx_ring + off, len);
4656			break;
4657		/* notification message */
4658		case REMOTE_NDIS_INDICATE_STATUS_MSG:
4659			hvn_rndis_status(sc, sc->sc_rx_ring + off, len);
4660			break;
4661		default:
4662			device_printf(sc->sc_dev,
4663			    "unhandled RNDIS message type %u\n", type);
4664			break;
4665		}
4666	}
4667
4668	hvn_nvs_ack(rxr, tid);
4669
4670	if (qfull)
4671		return -1;
4672	return rx;
4673}
4674
4675static inline struct mbuf *
4676hvn_devget(struct hvn_softc *sc, void *buf, uint32_t len)
4677{
4678	struct ifnet *ifp = SC2IFP(sc);
4679	struct mbuf *m;
4680	size_t size = len + ETHER_ALIGN + ETHER_VLAN_ENCAP_LEN;
4681
4682	MGETHDR(m, M_NOWAIT, MT_DATA);
4683	if (m == NULL)
4684		return NULL;
4685
4686	if (size > MHLEN) {
4687		if (size <= MCLBYTES)
4688			MCLGET(m, M_NOWAIT);
4689		else
4690			MEXTMALLOC(m, size, M_NOWAIT);
4691		if ((m->m_flags & M_EXT) == 0) {
4692			m_freem(m);
4693			return NULL;
4694		}
4695	}
4696
4697	m->m_len = m->m_pkthdr.len = size;
4698	m_adj(m, ETHER_ALIGN + ETHER_VLAN_ENCAP_LEN);
4699	m_copyback(m, 0, len, buf);
4700	m_set_rcvif(m, ifp);
4701	return m;
4702}
4703
4704#define HVN_RXINFO_CSUM		__BIT(NDIS_PKTINFO_TYPE_CSUM)
4705#define HVN_RXINFO_VLAN		__BIT(NDIS_PKTINFO_TYPE_VLAN)
4706#define HVN_RXINFO_HASHVAL	__BIT(HVN_NDIS_PKTINFO_TYPE_HASHVAL)
4707#define HVN_RXINFO_HASHINFO	__BIT(HVN_NDIS_PKTINFO_TYPE_HASHINF)
4708#define HVN_RXINFO_ALL		(HVN_RXINFO_CSUM | \
4709				 HVN_RXINFO_VLAN | \
4710				 HVN_RXINFO_HASHVAL | \
4711				 HVN_RXINFO_HASHINFO)
4712
4713static int
4714hvn_rxeof(struct hvn_rx_ring *rxr, uint8_t *buf, uint32_t len)
4715{
4716	struct hvn_softc *sc = rxr->rxr_softc;
4717	struct ifnet *ifp = SC2IFP(sc);
4718	struct rndis_packet_msg *pkt;
4719	struct rndis_pktinfo *pi;
4720	struct mbuf *m;
4721	uint32_t mask, csum, vlan, hashval, hashinfo;
4722
4723	if (!(ifp->if_flags & IFF_RUNNING))
4724		return 0;
4725
4726	if (len < sizeof(*pkt)) {
4727		device_printf(sc->sc_dev, "data packet too short: %u\n",
4728		    len);
4729		return 0;
4730	}
4731
4732	pkt = (struct rndis_packet_msg *)buf;
4733	if (pkt->rm_dataoffset + pkt->rm_datalen > len) {
4734		device_printf(sc->sc_dev,
4735		    "data packet out of bounds: %u@%u\n", pkt->rm_dataoffset,
4736		    pkt->rm_datalen);
4737		return 0;
4738	}
4739
4740	if ((m = hvn_devget(sc, buf + RNDIS_HEADER_OFFSET + pkt->rm_dataoffset,
4741	    pkt->rm_datalen)) == NULL) {
4742		if_statinc(ifp, if_ierrors);
4743		return 0;
4744	}
4745
4746	if (pkt->rm_pktinfooffset + pkt->rm_pktinfolen > len) {
4747		device_printf(sc->sc_dev,
4748		    "pktinfo is out of bounds: %u@%u vs %u\n",
4749		    pkt->rm_pktinfolen, pkt->rm_pktinfooffset, len);
4750		goto done;
4751	}
4752
4753	mask = csum = hashval = hashinfo = 0;
4754	vlan = 0xffffffff;
4755	pi = (struct rndis_pktinfo *)(buf + RNDIS_HEADER_OFFSET +
4756	    pkt->rm_pktinfooffset);
4757	while (pkt->rm_pktinfolen > 0) {
4758		if (pi->rm_size > pkt->rm_pktinfolen) {
4759			device_printf(sc->sc_dev,
4760			    "invalid pktinfo size: %u/%u\n", pi->rm_size,
4761			    pkt->rm_pktinfolen);
4762			break;
4763		}
4764
4765		switch (pi->rm_type) {
4766		case NDIS_PKTINFO_TYPE_CSUM:
4767			memcpy(&csum, pi->rm_data, sizeof(csum));
4768			SET(mask, HVN_RXINFO_CSUM);
4769			break;
4770		case NDIS_PKTINFO_TYPE_VLAN:
4771			memcpy(&vlan, pi->rm_data, sizeof(vlan));
4772			SET(mask, HVN_RXINFO_VLAN);
4773			break;
4774		case HVN_NDIS_PKTINFO_TYPE_HASHVAL:
4775			memcpy(&hashval, pi->rm_data, sizeof(hashval));
4776			SET(mask, HVN_RXINFO_HASHVAL);
4777			break;
4778		case HVN_NDIS_PKTINFO_TYPE_HASHINF:
4779			memcpy(&hashinfo, pi->rm_data, sizeof(hashinfo));
4780			SET(mask, HVN_RXINFO_HASHINFO);
4781			break;
4782		default:
4783			DPRINTF("%s: unhandled pktinfo type %u\n",
4784			    device_xname(sc->sc_dev), pi->rm_type);
4785			goto next;
4786		}
4787
4788		if (mask == HVN_RXINFO_ALL) {
4789			/* All found; done */
4790			break;
4791		}
4792 next:
4793		pkt->rm_pktinfolen -= pi->rm_size;
4794		pi = (struct rndis_pktinfo *)((char *)pi + pi->rm_size);
4795	}
4796
4797	/*
4798	 * Final fixup.
4799	 * - If there is no hash value, invalidate the hash info.
4800	 */
4801	if (!ISSET(mask, HVN_RXINFO_HASHVAL))
4802		hashinfo = 0;
4803
4804	if (csum != 0) {
4805		if (ISSET(csum, NDIS_RXCSUM_INFO_IPCS_OK) &&
4806			ISSET(ifp->if_csum_flags_rx, M_CSUM_IPv4)) {
4807			SET(m->m_pkthdr.csum_flags, M_CSUM_IPv4);
4808			rxr->rxr_evcsum_ip.ev_count++;
4809		}
4810		if (ISSET(csum, NDIS_RXCSUM_INFO_TCPCS_OK) &&
4811			ISSET(ifp->if_csum_flags_rx, M_CSUM_TCPv4)) {
4812			SET(m->m_pkthdr.csum_flags, M_CSUM_TCPv4);
4813			rxr->rxr_evcsum_tcp.ev_count++;
4814		}
4815		if (ISSET(csum, NDIS_RXCSUM_INFO_UDPCS_OK) &&
4816			ISSET(ifp->if_csum_flags_rx, M_CSUM_UDPv4)) {
4817			SET(m->m_pkthdr.csum_flags, M_CSUM_UDPv4);
4818			rxr->rxr_evcsum_udp.ev_count++;
4819		}
4820	}
4821
4822	if (vlan != 0xffffffff) {
4823		uint16_t t = NDIS_VLAN_INFO_ID(vlan);
4824		t |= NDIS_VLAN_INFO_PRI(vlan) << EVL_PRIO_BITS;
4825		t |= NDIS_VLAN_INFO_CFI(vlan) << EVL_CFI_BITS;
4826
4827		if (ISSET(sc->sc_ec.ec_capenable, ETHERCAP_VLAN_HWTAGGING)) {
4828			vlan_set_tag(m, t);
4829			rxr->rxr_evvlanhwtagging.ev_count++;
4830		} else {
4831			struct ether_header eh;
4832			struct ether_vlan_header *evl;
4833
4834			KDASSERT(m->m_pkthdr.len >= sizeof(eh));
4835			m_copydata(m, 0, sizeof(eh), &eh);
4836			M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
4837			KDASSERT(m != NULL);
4838
4839			evl = mtod(m, struct ether_vlan_header *);
4840			memcpy(evl->evl_dhost, eh.ether_dhost,
4841			    ETHER_ADDR_LEN * 2);
4842			evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
4843			evl->evl_tag = htons(t);
4844			evl->evl_proto = eh.ether_type;
4845		}
4846	}
4847
4848	/* XXX RSS hash is not supported. */
4849
4850 done:
4851	rxr->rxr_evpkts.ev_count++;
4852	if_percpuq_enqueue(sc->sc_ipq, m);
4853	/* XXX Unable to detect that the receive queue is full. */
4854	return 1;
4855}
4856
4857static void
4858hvn_rndis_complete(struct hvn_softc *sc, uint8_t *buf, uint32_t len)
4859{
4860	struct rndis_cmd *rc;
4861	uint32_t id;
4862
4863	memcpy(&id, buf + RNDIS_HEADER_OFFSET, sizeof(id));
4864	if ((rc = hvn_complete_cmd(sc, id)) != NULL) {
4865		mutex_enter(&rc->rc_lock);
4866		if (len < rc->rc_cmplen)
4867			device_printf(sc->sc_dev,
4868			    "RNDIS response %u too short: %u\n", id, len);
4869		else
4870			memcpy(&rc->rc_cmp, buf, rc->rc_cmplen);
4871		if (len > rc->rc_cmplen &&
4872		    len - rc->rc_cmplen > HVN_RNDIS_BUFSIZE)
4873			device_printf(sc->sc_dev,
4874			    "RNDIS response %u too large: %u\n", id, len);
4875		else if (len > rc->rc_cmplen)
4876			memcpy(&rc->rc_cmpbuf, buf + rc->rc_cmplen,
4877			    len - rc->rc_cmplen);
4878		rc->rc_done = 1;
4879		cv_signal(&rc->rc_cv);
4880		mutex_exit(&rc->rc_lock);
4881	} else {
4882		DPRINTF("%s: failed to complete RNDIS request id %u\n",
4883		    device_xname(sc->sc_dev), id);
4884	}
4885}
4886
4887static int
4888hvn_rndis_output_sgl(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
4889{
4890	struct hvn_softc *sc = txr->txr_softc;
4891	uint64_t rid = (uint64_t)txd->txd_id << 32;
4892	int rv;
4893
4894	rv = vmbus_channel_send_sgl(txr->txr_chan, txd->txd_sgl, txd->txd_nsge,
4895	    &sc->sc_data_msg, sizeof(sc->sc_data_msg), rid);
4896	if (rv) {
4897		DPRINTF("%s: RNDIS data send error %d\n",
4898		    device_xname(sc->sc_dev), rv);
4899		return rv;
4900	}
4901	return 0;
4902}
4903
4904static int
4905hvn_rndis_output_chim(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
4906{
4907	struct hvn_nvs_rndis rndis;
4908	uint64_t rid = (uint64_t)txd->txd_id << 32;
4909	int rv;
4910
4911	memset(&rndis, 0, sizeof(rndis));
4912	rndis.nvs_type = HVN_NVS_TYPE_RNDIS;
4913	rndis.nvs_rndis_mtype = HVN_NVS_RNDIS_MTYPE_DATA;
4914	rndis.nvs_chim_idx = txd->txd_chim_index;
4915	rndis.nvs_chim_sz = txd->txd_chim_size;
4916
4917	rv = vmbus_channel_send(txr->txr_chan, &rndis, sizeof(rndis),
4918	    rid, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC);
4919	if (rv) {
4920		DPRINTF("%s: RNDIS chimney data send error %d: idx %u, sz %u\n",
4921		    device_xname(sc->sc_dev), rv, rndis.nvs_chim_idx,
4922		    rndis.nvs_chim_sz);
4923		return rv;
4924	}
4925	return 0;
4926}
4927
4928static void
4929hvn_rndis_status(struct hvn_softc *sc, uint8_t *buf, uint32_t len)
4930{
4931	uint32_t status;
4932
4933	memcpy(&status, buf + RNDIS_HEADER_OFFSET, sizeof(status));
4934	switch (status) {
4935	case RNDIS_STATUS_MEDIA_CONNECT:
4936	case RNDIS_STATUS_MEDIA_DISCONNECT:
4937		hvn_link_event(sc, HVN_LINK_EV_STATE_CHANGE);
4938		break;
4939	case RNDIS_STATUS_NETWORK_CHANGE:
4940		hvn_link_event(sc, HVN_LINK_EV_NETWORK_CHANGE);
4941		break;
4942	/* Ignore these */
4943	case RNDIS_STATUS_OFFLOAD_CURRENT_CONFIG:
4944	case RNDIS_STATUS_LINK_SPEED_CHANGE:
4945		return;
4946	default:
4947		DPRINTF("%s: unhandled status %#x\n", device_xname(sc->sc_dev),
4948		    status);
4949		return;
4950	}
4951}
4952
4953static int
4954hvn_rndis_query(struct hvn_softc *sc, uint32_t oid, void *res, size_t *length)
4955{
4956
4957	return hvn_rndis_query2(sc, oid, NULL, 0, res, length, 0);
4958}
4959
4960static int
4961hvn_rndis_query2(struct hvn_softc *sc, uint32_t oid, const void *idata,
4962    size_t idlen, void *odata, size_t *odlen, size_t min_odlen)
4963{
4964	struct rndis_cmd *rc;
4965	struct rndis_query_req *req;
4966	struct rndis_query_comp *cmp;
4967	size_t olength = *odlen;
4968	int rv;
4969
4970	rc = hvn_alloc_cmd(sc);
4971
4972	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4973	    BUS_DMASYNC_PREREAD);
4974
4975	rc->rc_id = atomic_inc_uint_nv(&sc->sc_rndisrid);
4976
4977	req = rc->rc_req;
4978	req->rm_type = REMOTE_NDIS_QUERY_MSG;
4979	req->rm_len = sizeof(*req) + idlen;
4980	req->rm_rid = rc->rc_id;
4981	req->rm_oid = oid;
4982	req->rm_infobufoffset = sizeof(*req) - RNDIS_HEADER_OFFSET;
4983	if (idlen > 0) {
4984		KASSERT(sizeof(*req) + idlen <= PAGE_SIZE);
4985		req->rm_infobuflen = idlen;
4986		memcpy(req + 1, idata, idlen);
4987	}
4988
4989	rc->rc_cmplen = sizeof(*cmp);
4990
4991	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4992	    BUS_DMASYNC_PREWRITE);
4993
4994	if ((rv = hvn_rndis_cmd(sc, rc, 0)) != 0) {
4995		DPRINTF("%s: QUERY_MSG failed, error %d\n",
4996		    device_xname(sc->sc_dev), rv);
4997		hvn_free_cmd(sc, rc);
4998		return rv;
4999	}
5000
5001	cmp = (struct rndis_query_comp *)&rc->rc_cmp;
5002	switch (cmp->rm_status) {
5003	case RNDIS_STATUS_SUCCESS:
5004		if (cmp->rm_infobuflen > olength ||
5005		    (min_odlen > 0 && cmp->rm_infobuflen < min_odlen)) {
5006			rv = EINVAL;
5007			break;
5008		}
5009		memcpy(odata, rc->rc_cmpbuf, cmp->rm_infobuflen);
5010		*odlen = cmp->rm_infobuflen;
5011		break;
5012	default:
5013		*odlen = 0;
5014		rv = EIO;
5015		break;
5016	}
5017
5018	hvn_free_cmd(sc, rc);
5019	return rv;
5020}
5021
5022static int
5023hvn_rndis_set(struct hvn_softc *sc, uint32_t oid, void *data, size_t length)
5024{
5025	struct rndis_cmd *rc;
5026	struct rndis_set_req *req;
5027	struct rndis_set_comp *cmp;
5028	int rv;
5029
5030	rc = hvn_alloc_cmd(sc);
5031
5032	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
5033	    BUS_DMASYNC_PREREAD);
5034
5035	rc->rc_id = atomic_inc_uint_nv(&sc->sc_rndisrid);
5036
5037	req = rc->rc_req;
5038	req->rm_type = REMOTE_NDIS_SET_MSG;
5039	req->rm_len = sizeof(*req) + length;
5040	req->rm_rid = rc->rc_id;
5041	req->rm_oid = oid;
5042	req->rm_infobufoffset = sizeof(*req) - RNDIS_HEADER_OFFSET;
5043
5044	rc->rc_cmplen = sizeof(*cmp);
5045
5046	if (length > 0) {
5047		KASSERT(sizeof(*req) + length < PAGE_SIZE);
5048		req->rm_infobuflen = length;
5049		memcpy(req + 1, data, length);
5050	}
5051
5052	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
5053	    BUS_DMASYNC_PREWRITE);
5054
5055	if ((rv = hvn_rndis_cmd(sc, rc, 0)) != 0) {
5056		DPRINTF("%s: SET_MSG failed, error %d\n",
5057		    device_xname(sc->sc_dev), rv);
5058		hvn_free_cmd(sc, rc);
5059		return rv;
5060	}
5061
5062	cmp = (struct rndis_set_comp *)&rc->rc_cmp;
5063	if (cmp->rm_status != RNDIS_STATUS_SUCCESS)
5064		rv = EIO;
5065
5066	hvn_free_cmd(sc, rc);
5067	return rv;
5068}
5069
5070static int
5071hvn_rndis_open(struct hvn_softc *sc)
5072{
5073	struct ifnet *ifp = SC2IFP(sc);
5074	uint32_t filter;
5075	int rv;
5076
5077	if (ifp->if_flags & IFF_PROMISC) {
5078		filter = RNDIS_PACKET_TYPE_PROMISCUOUS;
5079	} else {
5080		filter = RNDIS_PACKET_TYPE_DIRECTED;
5081		if (ifp->if_flags & IFF_BROADCAST)
5082			filter |= RNDIS_PACKET_TYPE_BROADCAST;
5083		if (ifp->if_flags & IFF_ALLMULTI)
5084			filter |= RNDIS_PACKET_TYPE_ALL_MULTICAST;
5085		else {
5086			struct ethercom *ec = &sc->sc_ec;
5087			struct ether_multi *enm;
5088			struct ether_multistep step;
5089
5090			ETHER_LOCK(ec);
5091			ETHER_FIRST_MULTI(step, ec, enm);
5092			/* TODO: support multicast list */
5093			if (enm != NULL)
5094				filter |= RNDIS_PACKET_TYPE_ALL_MULTICAST;
5095			ETHER_UNLOCK(ec);
5096		}
5097	}
5098
5099	rv = hvn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER,
5100	    &filter, sizeof(filter));
5101	if (rv) {
5102		DPRINTF("%s: failed to set RNDIS filter to %#x\n",
5103		    device_xname(sc->sc_dev), filter);
5104	}
5105	return rv;
5106}
5107
5108static int
5109hvn_rndis_close(struct hvn_softc *sc)
5110{
5111	uint32_t filter = 0;
5112	int rv;
5113
5114	rv = hvn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER,
5115	    &filter, sizeof(filter));
5116	if (rv) {
5117		DPRINTF("%s: failed to clear RNDIS filter\n",
5118		    device_xname(sc->sc_dev));
5119	}
5120	return rv;
5121}
5122
5123static void
5124hvn_rndis_detach(struct hvn_softc *sc)
5125{
5126	struct rndis_cmd *rc;
5127	struct rndis_halt_req *req;
5128	int rv;
5129
5130	rc = hvn_alloc_cmd(sc);
5131
5132	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
5133	    BUS_DMASYNC_PREREAD);
5134
5135	rc->rc_id = atomic_inc_uint_nv(&sc->sc_rndisrid);
5136
5137	req = rc->rc_req;
5138	req->rm_type = REMOTE_NDIS_HALT_MSG;
5139	req->rm_len = sizeof(*req);
5140	req->rm_rid = rc->rc_id;
5141
5142	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
5143	    BUS_DMASYNC_PREWRITE);
5144
5145	/* No RNDIS completion; rely on NVS message send completion */
5146	if ((rv = hvn_rndis_cmd(sc, rc, HVN_RNDIS_CMD_NORESP)) != 0) {
5147		DPRINTF("%s: HALT_MSG failed, error %d\n",
5148		    device_xname(sc->sc_dev), rv);
5149	}
5150	hvn_free_cmd(sc, rc);
5151}
5152
5153static void
5154hvn_init_sysctls(struct hvn_softc *sc)
5155{
5156	struct sysctllog **log;
5157	const struct sysctlnode *rnode, *cnode, *rxnode, *txnode;
5158	const char *dvname;
5159	int error;
5160
5161	log = &sc->sc_sysctllog;
5162	dvname = device_xname(sc->sc_dev);
5163
5164	error = sysctl_createv(log, 0, NULL, &rnode,
5165	    0, CTLTYPE_NODE, dvname,
5166	    SYSCTL_DESCR("hvn information and settings"),
5167	    NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL);
5168	if (error)
5169		goto err;
5170
5171	error = sysctl_createv(log, 0, &rnode, &cnode,
5172	    CTLFLAG_READWRITE, CTLTYPE_BOOL, "txrx_workqueue",
5173	    SYSCTL_DESCR("Use workqueue for packet processing"),
5174	    NULL, 0, &sc->sc_txrx_workqueue, 0, CTL_CREATE, CTL_EOL);
5175	if (error)
5176		goto out;
5177
5178	error = sysctl_createv(log, 0, &rnode, &rxnode,
5179	    0, CTLTYPE_NODE, "rx",
5180	    SYSCTL_DESCR("hvn information and settings for Rx"),
5181	    NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL);
5182	if (error)
5183		goto out;
5184
5185	error = sysctl_createv(log, 0, &rxnode, NULL,
5186	    CTLFLAG_READWRITE, CTLTYPE_INT, "intr_process_limit",
5187	    SYSCTL_DESCR("max number of Rx packets"
5188	      " to process for interrupt processing"),
5189	    NULL, 0, &sc->sc_rx_intr_process_limit, 0, CTL_CREATE, CTL_EOL);
5190	if (error)
5191		goto out;
5192
5193	error = sysctl_createv(log, 0, &rxnode, NULL,
5194	    CTLFLAG_READWRITE, CTLTYPE_INT, "process_limit",
5195	    SYSCTL_DESCR("max number of Rx packets"
5196	      " to process for deferred processing"),
5197	    NULL, 0, &sc->sc_rx_process_limit, 0, CTL_CREATE, CTL_EOL);
5198	if (error)
5199		goto out;
5200
5201	error = sysctl_createv(log, 0, &rnode, &txnode,
5202	    0, CTLTYPE_NODE, "tx",
5203	    SYSCTL_DESCR("hvn information and settings for Tx"),
5204	    NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL);
5205	if (error)
5206		goto out;
5207
5208	error = sysctl_createv(log, 0, &txnode, NULL,
5209	    CTLFLAG_READWRITE, CTLTYPE_INT, "intr_process_limit",
5210	    SYSCTL_DESCR("max number of Tx packets"
5211	      " to process for interrupt processing"),
5212	    NULL, 0, &sc->sc_tx_intr_process_limit, 0, CTL_CREATE, CTL_EOL);
5213	if (error)
5214		goto out;
5215
5216	error = sysctl_createv(log, 0, &txnode, NULL,
5217	    CTLFLAG_READWRITE, CTLTYPE_INT, "process_limit",
5218	    SYSCTL_DESCR("max number of Tx packets"
5219	      " to process for deferred processing"),
5220	    NULL, 0, &sc->sc_tx_process_limit, 0, CTL_CREATE, CTL_EOL);
5221	if (error)
5222		goto out;
5223
5224	return;
5225
5226out:
5227	sysctl_teardown(log);
5228	sc->sc_sysctllog = NULL;
5229err:
5230	aprint_error_dev(sc->sc_dev, "sysctl_createv failed (err = %d)\n",
5231	    error);
5232}
5233
5234SYSCTL_SETUP(sysctl_hw_hvn_setup, "sysctl hw.hvn setup")
5235{
5236	const struct sysctlnode *rnode;
5237	const struct sysctlnode *cnode;
5238	int error;
5239
5240	error = sysctl_createv(clog, 0, NULL, &rnode,
5241	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "hvn",
5242	    SYSCTL_DESCR("hvn global controls"),
5243	    NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL);
5244	if (error)
5245		goto fail;
5246
5247	error = sysctl_createv(clog, 0, &rnode, &cnode,
5248	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
5249	    "udp_csum_fixup_mtu",
5250	    SYSCTL_DESCR("UDP checksum offloding fixup MTU"),
5251	    NULL, 0, &hvn_udpcs_fixup_mtu, sizeof(hvn_udpcs_fixup_mtu),
5252	    CTL_CREATE, CTL_EOL);
5253	if (error)
5254		goto fail;
5255
5256	error = sysctl_createv(clog, 0, &rnode, &cnode,
5257	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
5258	    "chimney_size",
5259	    SYSCTL_DESCR("Chimney send packet size limit"),
5260	    NULL, 0, &hvn_tx_chimney_size, sizeof(hvn_tx_chimney_size),
5261	    CTL_CREATE, CTL_EOL);
5262	if (error)
5263		goto fail;
5264
5265	error = sysctl_createv(clog, 0, &rnode, &cnode,
5266	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
5267	    "channel_count",
5268	    SYSCTL_DESCR("# of channels to use"),
5269	    NULL, 0, &hvn_channel_cnt, sizeof(hvn_channel_cnt),
5270	    CTL_CREATE, CTL_EOL);
5271	if (error)
5272		goto fail;
5273
5274	error = sysctl_createv(clog, 0, &rnode, &cnode,
5275	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
5276	    "tx_ring_count",
5277	    SYSCTL_DESCR("# of transmit rings to use"),
5278	    NULL, 0, &hvn_tx_ring_cnt, sizeof(hvn_tx_ring_cnt),
5279	    CTL_CREATE, CTL_EOL);
5280	if (error)
5281		goto fail;
5282
5283	return;
5284
5285fail:
5286	aprint_error("%s: sysctl_createv failed (err = %d)\n", __func__, error);
5287}
5288