1283625Sdim/*-
2283625Sdim * Copyright (c) 2010-2012 Citrix Inc.
3283625Sdim * Copyright (c) 2009-2012 Microsoft Corp.
4283625Sdim * Copyright (c) 2012 NetApp Inc.
5283625Sdim * All rights reserved.
6283625Sdim *
7283625Sdim * Redistribution and use in source and binary forms, with or without
8283625Sdim * modification, are permitted provided that the following conditions
9283625Sdim * are met:
10283625Sdim * 1. Redistributions of source code must retain the above copyright
11283625Sdim *    notice unmodified, this list of conditions, and the following
12283625Sdim *    disclaimer.
13283625Sdim * 2. Redistributions in binary form must reproduce the above copyright
14283625Sdim *    notice, this list of conditions and the following disclaimer in the
15284236Sdim *    documentation and/or other materials provided with the distribution.
16284236Sdim *
17283625Sdim * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18296417Sdim * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19283625Sdim * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20284236Sdim * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21296417Sdim * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22284236Sdim * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23284236Sdim * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24284236Sdim * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25283625Sdim * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26296417Sdim * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27296417Sdim */
28296417Sdim
29296417Sdim/*-
30296417Sdim * Copyright (c) 2004-2006 Kip Macy
31296417Sdim * All rights reserved.
32296417Sdim *
33296417Sdim * Redistribution and use in source and binary forms, with or without
34296417Sdim * modification, are permitted provided that the following conditions
35296417Sdim * are met:
36296417Sdim * 1. Redistributions of source code must retain the above copyright
37296417Sdim *    notice, this list of conditions and the following disclaimer.
38296417Sdim * 2. Redistributions in binary form must reproduce the above copyright
39296417Sdim *    notice, this list of conditions and the following disclaimer in the
40296417Sdim *    documentation and/or other materials provided with the distribution.
41296417Sdim *
42296417Sdim * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43296417Sdim * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44296417Sdim * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45284236Sdim * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46284236Sdim * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47284236Sdim * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48296417Sdim * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49283625Sdim * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50283625Sdim * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51284236Sdim * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52284236Sdim * SUCH DAMAGE.
53284236Sdim */
54284236Sdim
55284236Sdim#include <sys/cdefs.h>
56284236Sdim__FBSDID("$FreeBSD: releng/10.3/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c 295948 2016-02-24 01:30:50Z sephe $");
57284236Sdim
58296417Sdim#include "opt_inet6.h"
59296417Sdim#include "opt_inet.h"
60296417Sdim
61296417Sdim#include <sys/param.h>
62296417Sdim#include <sys/systm.h>
63296417Sdim#include <sys/sockio.h>
64296417Sdim#include <sys/mbuf.h>
65296417Sdim#include <sys/malloc.h>
66296417Sdim#include <sys/module.h>
67296417Sdim#include <sys/kernel.h>
68296417Sdim#include <sys/socket.h>
69296417Sdim#include <sys/queue.h>
70296417Sdim#include <sys/lock.h>
71296417Sdim#include <sys/sx.h>
72296417Sdim#include <sys/sysctl.h>
73296417Sdim
74296417Sdim#include <net/if.h>
75296417Sdim#include <net/if_arp.h>
76296417Sdim#include <net/ethernet.h>
77296417Sdim#include <net/if_dl.h>
78296417Sdim#include <net/if_media.h>
79296417Sdim
80296417Sdim#include <net/bpf.h>
81296417Sdim
82296417Sdim#include <net/if_types.h>
83296417Sdim#include <net/if_vlan_var.h>
84296417Sdim#include <net/if.h>
85296417Sdim
86296417Sdim#include <netinet/in_systm.h>
87296417Sdim#include <netinet/in.h>
88296417Sdim#include <netinet/ip.h>
89296417Sdim#include <netinet/if_ether.h>
90296417Sdim#include <netinet/tcp.h>
91296417Sdim#include <netinet/udp.h>
92284734Sdim#include <netinet/ip6.h>
93284734Sdim
94284734Sdim#include <vm/vm.h>
95284734Sdim#include <vm/vm_param.h>
96284734Sdim#include <vm/vm_kern.h>
97284734Sdim#include <vm/pmap.h>
98284734Sdim
99296417Sdim#include <machine/bus.h>
100296417Sdim#include <machine/resource.h>
101284734Sdim#include <machine/frame.h>
102309124Sdim#include <machine/vmparam.h>
103309124Sdim
104284734Sdim#include <sys/bus.h>
105284734Sdim#include <sys/rman.h>
106284734Sdim#include <sys/mutex.h>
107296417Sdim#include <sys/errno.h>
108296417Sdim#include <sys/types.h>
109296417Sdim#include <machine/atomic.h>
110296417Sdim
111296417Sdim#include <machine/intr_machdep.h>
112296417Sdim
113296417Sdim#include <machine/in_cksum.h>
114284236Sdim
115284236Sdim#include <dev/hyperv/include/hyperv.h>
116284236Sdim#include "hv_net_vsc.h"
117284236Sdim#include "hv_rndis.h"
118284236Sdim#include "hv_rndis_filter.h"
119284236Sdim
120284236Sdim
121284236Sdim/* Short for Hyper-V network interface */
122284236Sdim#define NETVSC_DEVNAME    "hn"
123284236Sdim
124284236Sdim/*
125284236Sdim * It looks like offset 0 of buf is reserved to hold the softc pointer.
126284236Sdim * The sc pointer evidently not needed, and is not presently populated.
127284236Sdim * The packet offset is where the netvsc_packet starts in the buffer.
128296417Sdim */
129296417Sdim#define HV_NV_SC_PTR_OFFSET_IN_BUF         0
130296417Sdim#define HV_NV_PACKET_OFFSET_IN_BUF         16
131296417Sdim
132296417Sdim/* YYY should get it from the underlying channel */
133296417Sdim#define HN_TX_DESC_CNT			512
134296417Sdim
135296417Sdim#define HN_RNDIS_MSG_LEN		\
136296417Sdim    (sizeof(rndis_msg) +		\
137296417Sdim     RNDIS_VLAN_PPI_SIZE +		\
138296417Sdim     RNDIS_TSO_PPI_SIZE +		\
139296417Sdim     RNDIS_CSUM_PPI_SIZE)
140296417Sdim#define HN_RNDIS_MSG_BOUNDARY		PAGE_SIZE
141296417Sdim#define HN_RNDIS_MSG_ALIGN		CACHE_LINE_SIZE
142296417Sdim
143296417Sdim#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
144296417Sdim#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
145296417Sdim#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
146296417Sdim#define HN_TX_DATA_SEGCNT_MAX		\
147296417Sdim    (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS)
148296417Sdim
149284236Sdimstruct hn_txdesc {
150284236Sdim	SLIST_ENTRY(hn_txdesc) link;
151284236Sdim	struct mbuf	*m;
152284236Sdim	struct hn_softc	*sc;
153284236Sdim	int		refs;
154284236Sdim	uint32_t	flags;		/* HN_TXD_FLAG_ */
155284236Sdim	netvsc_packet	netvsc_pkt;	/* XXX to be removed */
156284236Sdim
157284236Sdim	bus_dmamap_t	data_dmap;
158284236Sdim
159284236Sdim	bus_addr_t	rndis_msg_paddr;
160284236Sdim	rndis_msg	*rndis_msg;
161296417Sdim	bus_dmamap_t	rndis_msg_dmap;
162296417Sdim};
163309124Sdim
164309124Sdim#define HN_TXD_FLAG_ONLIST	0x1
165296417Sdim#define HN_TXD_FLAG_DMAMAP	0x2
166296417Sdim
167296417Sdim/*
168283625Sdim * A unified flag for all outbound check sum flags is useful,
169283625Sdim * and it helps avoiding unnecessary check sum calculation in
170283625Sdim * network forwarding scenario.
171283625Sdim */
172283625Sdim#define HV_CSUM_FOR_OUTBOUND						\
173283625Sdim    (CSUM_IP|CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP|CSUM_IP_TSO|		\
174283625Sdim    CSUM_IP_ISCSI|CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP|		\
175283625Sdim    CSUM_IP6_TSO|CSUM_IP6_ISCSI)
176283625Sdim
177283625Sdim/* XXX move to netinet/tcp_lro.h */
178283625Sdim#define HN_LRO_HIWAT_MAX				65535
179283625Sdim#define HN_LRO_HIWAT_DEF				HN_LRO_HIWAT_MAX
180283625Sdim/* YYY 2*MTU is a bit rough, but should be good enough. */
181283625Sdim#define HN_LRO_HIWAT_MTULIM(ifp)			(2 * (ifp)->if_mtu)
182283625Sdim#define HN_LRO_HIWAT_ISVALID(sc, hiwat)			\
183283625Sdim    ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) ||	\
184283625Sdim     (hiwat) <= HN_LRO_HIWAT_MAX)
185283625Sdim
186283625Sdim/*
187283625Sdim * Be aware that this sleepable mutex will exhibit WITNESS errors when
188283625Sdim * certain TCP and ARP code paths are taken.  This appears to be a
189283625Sdim * well-known condition, as all other drivers checked use a sleeping
190283625Sdim * mutex to protect their transmit paths.
191283625Sdim * Also Be aware that mutexes do not play well with semaphores, and there
192283625Sdim * is a conflicting semaphore in a certain channel code path.
193283625Sdim */
194309124Sdim#define NV_LOCK_INIT(_sc, _name) \
195309124Sdim	    mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF)
196309124Sdim#define NV_LOCK(_sc)		mtx_lock(&(_sc)->hn_lock)
197309124Sdim#define NV_LOCK_ASSERT(_sc)	mtx_assert(&(_sc)->hn_lock, MA_OWNED)
198309124Sdim#define NV_UNLOCK(_sc)		mtx_unlock(&(_sc)->hn_lock)
199309124Sdim#define NV_LOCK_DESTROY(_sc)	mtx_destroy(&(_sc)->hn_lock)
200309124Sdim
201309124Sdim
202309124Sdim/*
203309124Sdim * Globals
204309124Sdim */
205309124Sdim
206309124Sdimint hv_promisc_mode = 0;    /* normal mode by default */
207309124Sdim
208309124Sdim/* Trust tcp segements verification on host side. */
209309124Sdimstatic int hn_trust_hosttcp = 0;
210309124SdimTUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp);
211309124Sdim
212309124Sdim#if __FreeBSD_version >= 1100045
213309124Sdim/* Limit TSO burst size */
214309124Sdimstatic int hn_tso_maxlen = 0;
215309124SdimTUNABLE_INT("dev.hn.tso_maxlen", &hn_tso_maxlen);
216309124Sdim#endif
217309124Sdim
218309124Sdim/* Limit chimney send size */
219309124Sdimstatic int hn_tx_chimney_size = 0;
220309124SdimTUNABLE_INT("dev.hn.tx_chimney_size", &hn_tx_chimney_size);
221309124Sdim
222309124Sdim/*
223309124Sdim * Forward declarations
224309124Sdim */
225309124Sdimstatic void hn_stop(hn_softc_t *sc);
226309124Sdimstatic void hn_ifinit_locked(hn_softc_t *sc);
227309124Sdimstatic void hn_ifinit(void *xsc);
228309124Sdimstatic int  hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
229309124Sdimstatic void hn_start_locked(struct ifnet *ifp);
230309124Sdimstatic void hn_start(struct ifnet *ifp);
231309124Sdimstatic int hn_ifmedia_upd(struct ifnet *ifp);
232309124Sdimstatic void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
233309124Sdim#ifdef HN_LRO_HIWAT
234309124Sdimstatic int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS);
235309124Sdim#endif
236309124Sdimstatic int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS);
237309124Sdimstatic int hn_check_iplen(const struct mbuf *, int);
238309124Sdimstatic int hn_create_tx_ring(struct hn_softc *sc);
239309124Sdimstatic void hn_destroy_tx_ring(struct hn_softc *sc);
240309124Sdim
241309124Sdimstatic __inline void
242309124Sdimhn_set_lro_hiwat(struct hn_softc *sc, int hiwat)
243309124Sdim{
244309124Sdim	sc->hn_lro_hiwat = hiwat;
245309124Sdim#ifdef HN_LRO_HIWAT
246309124Sdim	sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
247309124Sdim#endif
248309124Sdim}
249284236Sdim
250284236Sdim/*
251284236Sdim * NetVsc get message transport protocol type
252284236Sdim */
253284236Sdimstatic uint32_t get_transport_proto_type(struct mbuf *m_head)
254284236Sdim{
255284236Sdim	uint32_t ret_val = TRANSPORT_TYPE_NOT_IP;
256284236Sdim	uint16_t ether_type = 0;
257284236Sdim	int ether_len = 0;
258284236Sdim	struct ether_vlan_header *eh;
259284236Sdim#ifdef INET
260284236Sdim	struct ip *iph;
261284236Sdim#endif
262284236Sdim#ifdef INET6
263284236Sdim	struct ip6_hdr *ip6;
264284236Sdim#endif
265284236Sdim
266284236Sdim	eh = mtod(m_head, struct ether_vlan_header*);
267283625Sdim	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
268283625Sdim		ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
269283625Sdim		ether_type = eh->evl_proto;
270283625Sdim	} else {
271283625Sdim		ether_len = ETHER_HDR_LEN;
272283625Sdim		ether_type = eh->evl_encap_proto;
273283625Sdim	}
274283625Sdim
275283625Sdim	switch (ntohs(ether_type)) {
276283625Sdim#ifdef INET6
277283625Sdim	case ETHERTYPE_IPV6:
278283625Sdim		ip6 = (struct ip6_hdr *)(m_head->m_data + ether_len);
279283625Sdim
280283625Sdim		if (IPPROTO_TCP == ip6->ip6_nxt) {
281283625Sdim			ret_val = TRANSPORT_TYPE_IPV6_TCP;
282283625Sdim		} else if (IPPROTO_UDP == ip6->ip6_nxt) {
283283625Sdim			ret_val = TRANSPORT_TYPE_IPV6_UDP;
284283625Sdim		}
285283625Sdim		break;
286283625Sdim#endif
287283625Sdim#ifdef INET
288283625Sdim	case ETHERTYPE_IP:
289283625Sdim		iph = (struct ip *)(m_head->m_data + ether_len);
290283625Sdim
291283625Sdim		if (IPPROTO_TCP == iph->ip_p) {
292283625Sdim			ret_val = TRANSPORT_TYPE_IPV4_TCP;
293283625Sdim		} else if (IPPROTO_UDP == iph->ip_p) {
294283625Sdim			ret_val = TRANSPORT_TYPE_IPV4_UDP;
295283625Sdim		}
296283625Sdim		break;
297283625Sdim#endif
298283625Sdim	default:
299283625Sdim		ret_val = TRANSPORT_TYPE_NOT_IP;
300283625Sdim		break;
301283625Sdim	}
302283625Sdim
303283625Sdim	return (ret_val);
304283625Sdim}
305283625Sdim
306283625Sdimstatic int
307283625Sdimhn_ifmedia_upd(struct ifnet *ifp __unused)
308283625Sdim{
309283625Sdim
310283625Sdim	return EOPNOTSUPP;
311283625Sdim}
312283625Sdim
313283625Sdimstatic void
314284236Sdimhn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
315283625Sdim{
316284236Sdim	struct hn_softc *sc = ifp->if_softc;
317284236Sdim
318284236Sdim	ifmr->ifm_status = IFM_AVALID;
319284236Sdim	ifmr->ifm_active = IFM_ETHER;
320284236Sdim
321284236Sdim	if (!sc->hn_carrier) {
322283625Sdim		ifmr->ifm_active |= IFM_NONE;
323283625Sdim		return;
324283625Sdim	}
325283625Sdim	ifmr->ifm_status |= IFM_ACTIVE;
326283625Sdim	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
327283625Sdim}
328283625Sdim
329283625Sdim/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
330283625Sdimstatic const hv_guid g_net_vsc_device_type = {
331283625Sdim	.data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
332283625Sdim		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
333296417Sdim};
334296417Sdim
335296417Sdim/*
336296417Sdim * Standard probe entry point.
337296417Sdim *
338296417Sdim */
339296417Sdimstatic int
340296417Sdimnetvsc_probe(device_t dev)
341296417Sdim{
342296417Sdim	const char *p;
343296417Sdim
344296417Sdim	p = vmbus_get_type(dev);
345296417Sdim	if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) {
346296417Sdim		device_set_desc(dev, "Synthetic Network Interface");
347296417Sdim		if (bootverbose)
348296417Sdim			printf("Netvsc probe... DONE \n");
349296417Sdim
350296417Sdim		return (BUS_PROBE_DEFAULT);
351296417Sdim	}
352284236Sdim
353284236Sdim	return (ENXIO);
354284236Sdim}
355284236Sdim
356284236Sdim/*
357284236Sdim * Standard attach entry point.
358284236Sdim *
359284236Sdim * Called when the driver is loaded.  It allocates needed resources,
360284236Sdim * and initializes the "hardware" and software.
361284236Sdim */
362284236Sdimstatic int
363284236Sdimnetvsc_attach(device_t dev)
364284236Sdim{
365284236Sdim	struct hv_device *device_ctx = vmbus_get_devctx(dev);
366284236Sdim	netvsc_device_info device_info;
367283625Sdim	hn_softc_t *sc;
368283625Sdim	int unit = device_get_unit(dev);
369283625Sdim	struct ifnet *ifp = NULL;
370283625Sdim	struct sysctl_oid_list *child;
371283625Sdim	struct sysctl_ctx_list *ctx;
372283625Sdim	int error;
373283625Sdim#if __FreeBSD_version >= 1100045
374283625Sdim	int tso_maxlen;
375284236Sdim#endif
376284236Sdim
377284236Sdim	sc = device_get_softc(dev);
378284236Sdim	if (sc == NULL) {
379284236Sdim		return (ENOMEM);
380284236Sdim	}
381284236Sdim
382284236Sdim	bzero(sc, sizeof(hn_softc_t));
383284236Sdim	sc->hn_unit = unit;
384284236Sdim	sc->hn_dev = dev;
385284236Sdim	sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF;
386284236Sdim	sc->hn_trust_hosttcp = hn_trust_hosttcp;
387284236Sdim
388284236Sdim	error = hn_create_tx_ring(sc);
389284236Sdim	if (error)
390284236Sdim		goto failed;
391284236Sdim
392284236Sdim	NV_LOCK_INIT(sc, "NetVSCLock");
393284236Sdim
394284236Sdim	sc->hn_dev_obj = device_ctx;
395284236Sdim
396284236Sdim	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
397284236Sdim	ifp->if_softc = sc;
398284236Sdim
399284236Sdim	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
400284236Sdim	ifp->if_dunit = unit;
401283625Sdim	ifp->if_dname = NETVSC_DEVNAME;
402283625Sdim
403283625Sdim	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
404283625Sdim	ifp->if_ioctl = hn_ioctl;
405283625Sdim	ifp->if_start = hn_start;
406283625Sdim	ifp->if_init = hn_ifinit;
407283625Sdim	/* needed by hv_rf_on_device_add() code */
408296417Sdim	ifp->if_mtu = ETHERMTU;
409296417Sdim	IFQ_SET_MAXLEN(&ifp->if_snd, 512);
410296417Sdim	ifp->if_snd.ifq_drv_maxlen = 511;
411296417Sdim	IFQ_SET_READY(&ifp->if_snd);
412296417Sdim
413296417Sdim	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
414296417Sdim	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
415284236Sdim	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
416284236Sdim	/* XXX ifmedia_set really should do this for us */
417284236Sdim	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
418284236Sdim
419284236Sdim	/*
420284236Sdim	 * Tell upper layers that we support full VLAN capability.
421284236Sdim	 */
422284236Sdim	ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
423284236Sdim	ifp->if_capabilities |=
424284236Sdim	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
425284236Sdim	    IFCAP_LRO;
426284236Sdim	ifp->if_capenable |=
427283625Sdim	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
428283625Sdim	    IFCAP_LRO;
429283625Sdim	/*
430283625Sdim	 * Only enable UDP checksum offloading when it is on 2012R2 or
431283625Sdim	 * later. UDP checksum offloading doesn't work on earlier
432283625Sdim	 * Windows releases.
433283625Sdim	 */
434296417Sdim	if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1)
435296417Sdim		ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
436296417Sdim	else
437296417Sdim		ifp->if_hwassist = CSUM_TCP | CSUM_TSO;
438296417Sdim
439284236Sdim	error = hv_rf_on_device_add(device_ctx, &device_info);
440284236Sdim	if (error)
441284236Sdim		goto failed;
442284236Sdim
443284236Sdim	if (device_info.link_state == 0) {
444284236Sdim		sc->hn_carrier = 1;
445284236Sdim	}
446284236Sdim
447284236Sdim#if defined(INET) || defined(INET6)
448283625Sdim	tcp_lro_init(&sc->hn_lro);
449283625Sdim	/* Driver private LRO settings */
450283625Sdim	sc->hn_lro.ifp = ifp;
451283625Sdim#ifdef HN_LRO_HIWAT
452283625Sdim	sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
453283625Sdim#endif
454283625Sdim#endif	/* INET || INET6 */
455283625Sdim
456283625Sdim#if __FreeBSD_version >= 1100045
457283625Sdim	tso_maxlen = hn_tso_maxlen;
458283625Sdim	if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
459283625Sdim		tso_maxlen = IP_MAXPACKET;
460309124Sdim
461309124Sdim	ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
462309124Sdim	ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
463309124Sdim	ifp->if_hw_tsomax = tso_maxlen -
464309124Sdim	    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
465309124Sdim#endif
466296417Sdim
467296417Sdim	ether_ifattach(ifp, device_info.mac_addr);
468296417Sdim
469296417Sdim#if __FreeBSD_version >= 1100045
470296417Sdim	if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
471296417Sdim	    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
472283625Sdim#endif
473296417Sdim
474296417Sdim	sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
475296417Sdim	sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
476296417Sdim	if (hn_tx_chimney_size > 0 &&
477283625Sdim	    hn_tx_chimney_size < sc->hn_tx_chimney_max)
478296417Sdim		sc->hn_tx_chimney_size = hn_tx_chimney_size;
479309124Sdim
480309124Sdim	ctx = device_get_sysctl_ctx(dev);
481309124Sdim	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
482296417Sdim
483296417Sdim	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued",
484283625Sdim	    CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued");
485296417Sdim	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed",
486296417Sdim	    CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed");
487296417Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried",
488283625Sdim	    CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries");
489283625Sdim#ifdef HN_LRO_HIWAT
490283625Sdim	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat",
491283625Sdim	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl,
492283625Sdim	    "I", "LRO high watermark");
493283625Sdim#endif
494283625Sdim	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "trust_hosttcp",
495283625Sdim	    CTLFLAG_RW, &sc->hn_trust_hosttcp, 0,
496283625Sdim	    "Trust tcp segement verification on host side, "
497283625Sdim	    "when csum info is missing");
498283625Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip",
499283625Sdim	    CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP");
500283625Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp",
501283625Sdim	    CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP");
502284236Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted",
503284236Sdim	    CTLFLAG_RW, &sc->hn_csum_trusted,
504284236Sdim	    "# of TCP segements that we trust host's csum verification");
505284236Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts",
506284236Sdim	    CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received");
507284236Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "no_txdescs",
508284236Sdim	    CTLFLAG_RW, &sc->hn_no_txdescs, "# of times short of TX descs");
509284236Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "send_failed",
510284236Sdim	    CTLFLAG_RW, &sc->hn_send_failed, "# of hyper-v sending failure");
511284236Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "txdma_failed",
512284236Sdim	    CTLFLAG_RW, &sc->hn_txdma_failed, "# of TX DMA failure");
513284236Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_collapsed",
514284236Sdim	    CTLFLAG_RW, &sc->hn_tx_collapsed, "# of TX mbuf collapsed");
515284236Sdim	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_chimney",
516284236Sdim	    CTLFLAG_RW, &sc->hn_tx_chimney, "# of chimney send");
517284236Sdim	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
518284236Sdim	    CTLFLAG_RD, &sc->hn_txdesc_cnt, 0, "# of total TX descs");
519284236Sdim	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
520284236Sdim	    CTLFLAG_RD, &sc->hn_txdesc_avail, 0, "# of available TX descs");
521284236Sdim	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
522284236Sdim	    CTLFLAG_RD, &sc->hn_tx_chimney_max, 0,
523284236Sdim	    "Chimney send packet size upper boundary");
524284236Sdim	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
525284236Sdim	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
526284236Sdim	    "I", "Chimney send packet size limit");
527284236Sdim
528283625Sdim	if (unit == 0) {
529283625Sdim		struct sysctl_ctx_list *dc_ctx;
530283625Sdim		struct sysctl_oid_list *dc_child;
531283625Sdim		devclass_t dc;
532283625Sdim
533283625Sdim		/*
534283625Sdim		 * Add sysctl nodes for devclass
535283625Sdim		 */
536283625Sdim		dc = device_get_devclass(dev);
537283625Sdim		dc_ctx = devclass_get_sysctl_ctx(dc);
538283625Sdim		dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc));
539283625Sdim
540283625Sdim		SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp",
541283625Sdim		    CTLFLAG_RD, &hn_trust_hosttcp, 0,
542283625Sdim		    "Trust tcp segement verification on host side, "
543283625Sdim		    "when csum info is missing (global setting)");
544284236Sdim		SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tx_chimney_size",
545284236Sdim		    CTLFLAG_RD, &hn_tx_chimney_size, 0,
546284236Sdim		    "Chimney send packet size limit");
547284236Sdim#if __FreeBSD_version >= 1100045
548283625Sdim		SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tso_maxlen",
549283625Sdim		    CTLFLAG_RD, &hn_tso_maxlen, 0, "TSO burst limit");
550284236Sdim#endif
551284236Sdim	}
552284236Sdim
553284236Sdim	return (0);
554283625Sdimfailed:
555283625Sdim	hn_destroy_tx_ring(sc);
556296417Sdim	if (ifp != NULL)
557296417Sdim		if_free(ifp);
558296417Sdim	return (error);
559296417Sdim}
560296417Sdim
561296417Sdim/*
562296417Sdim * Standard detach entry point
563296417Sdim */
564296417Sdimstatic int
565296417Sdimnetvsc_detach(device_t dev)
566296417Sdim{
567296417Sdim	struct hn_softc *sc = device_get_softc(dev);
568296417Sdim	struct hv_device *hv_device = vmbus_get_devctx(dev);
569284236Sdim
570284236Sdim	if (bootverbose)
571284236Sdim		printf("netvsc_detach\n");
572284236Sdim
573284236Sdim	/*
574284236Sdim	 * XXXKYS:  Need to clean up all our
575284236Sdim	 * driver state; this is the driver
576284236Sdim	 * unloading.
577284236Sdim	 */
578284236Sdim
579284236Sdim	/*
580283625Sdim	 * XXXKYS:  Need to stop outgoing traffic and unregister
581283625Sdim	 * the netdevice.
582283625Sdim	 */
583283625Sdim
584283625Sdim	hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL);
585283625Sdim
586283625Sdim	ifmedia_removeall(&sc->hn_media);
587283625Sdim#if defined(INET) || defined(INET6)
588283625Sdim	tcp_lro_free(&sc->hn_lro);
589296417Sdim#endif
590296417Sdim	hn_destroy_tx_ring(sc);
591296417Sdim
592296417Sdim	return (0);
593296417Sdim}
594296417Sdim
595296417Sdim/*
596296417Sdim * Standard shutdown entry point
597296417Sdim */
598296417Sdimstatic int
599296417Sdimnetvsc_shutdown(device_t dev)
600296417Sdim{
601309124Sdim	return (0);
602309124Sdim}
603309124Sdim
604309124Sdimstatic __inline int
605309124Sdimhn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd,
606309124Sdim    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
607309124Sdim{
608309124Sdim	struct mbuf *m = *m_head;
609309124Sdim	int error;
610309124Sdim
611309124Sdim	error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, txd->data_dmap,
612309124Sdim	    m, segs, nsegs, BUS_DMA_NOWAIT);
613309124Sdim	if (error == EFBIG) {
614309124Sdim		struct mbuf *m_new;
615309124Sdim
616309124Sdim		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
617309124Sdim		if (m_new == NULL)
618309124Sdim			return ENOBUFS;
619309124Sdim		else
620309124Sdim			*m_head = m = m_new;
621309124Sdim		sc->hn_tx_collapsed++;
622309124Sdim
623309124Sdim		error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag,
624309124Sdim		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
625309124Sdim	}
626309124Sdim	if (!error) {
627309124Sdim		bus_dmamap_sync(sc->hn_tx_data_dtag, txd->data_dmap,
628309124Sdim		    BUS_DMASYNC_PREWRITE);
629309124Sdim		txd->flags |= HN_TXD_FLAG_DMAMAP;
630309124Sdim	}
631309124Sdim	return error;
632309124Sdim}
633309124Sdim
634309124Sdimstatic __inline void
635309124Sdimhn_txdesc_dmamap_unload(struct hn_softc *sc, struct hn_txdesc *txd)
636309124Sdim{
637309124Sdim
638309124Sdim	if (txd->flags & HN_TXD_FLAG_DMAMAP) {
639309124Sdim		bus_dmamap_sync(sc->hn_tx_data_dtag,
640309124Sdim		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
641309124Sdim		bus_dmamap_unload(sc->hn_tx_data_dtag,
642309124Sdim		    txd->data_dmap);
643309124Sdim		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
644309124Sdim	}
645309124Sdim}
646309124Sdim
647309124Sdimstatic __inline int
648309124Sdimhn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd)
649309124Sdim{
650309124Sdim
651309124Sdim	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
652309124Sdim	    ("put an onlist txd %#x", txd->flags));
653309124Sdim
654309124Sdim	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
655309124Sdim	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
656309124Sdim		return 0;
657309124Sdim
658309124Sdim	hn_txdesc_dmamap_unload(sc, txd);
659309124Sdim	if (txd->m != NULL) {
660309124Sdim		m_freem(txd->m);
661284236Sdim		txd->m = NULL;
662284236Sdim	}
663284236Sdim
664283625Sdim	txd->flags |= HN_TXD_FLAG_ONLIST;
665283625Sdim
666284236Sdim	mtx_lock_spin(&sc->hn_txlist_spin);
667284236Sdim	KASSERT(sc->hn_txdesc_avail >= 0 &&
668284236Sdim	    sc->hn_txdesc_avail < sc->hn_txdesc_cnt,
669284236Sdim	    ("txdesc_put: invalid txd avail %d", sc->hn_txdesc_avail));
670283625Sdim	sc->hn_txdesc_avail++;
671283625Sdim	SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
672296417Sdim	mtx_unlock_spin(&sc->hn_txlist_spin);
673296417Sdim
674296417Sdim	return 1;
675296417Sdim}
676296417Sdim
677296417Sdimstatic __inline struct hn_txdesc *
678296417Sdimhn_txdesc_get(struct hn_softc *sc)
679296417Sdim{
680296417Sdim	struct hn_txdesc *txd;
681296417Sdim
682296417Sdim	mtx_lock_spin(&sc->hn_txlist_spin);
683296417Sdim	txd = SLIST_FIRST(&sc->hn_txlist);
684296417Sdim	if (txd != NULL) {
685296417Sdim		KASSERT(sc->hn_txdesc_avail > 0,
686296417Sdim		    ("txdesc_get: invalid txd avail %d", sc->hn_txdesc_avail));
687296417Sdim		sc->hn_txdesc_avail--;
688296417Sdim		SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
689296417Sdim	}
690296417Sdim	mtx_unlock_spin(&sc->hn_txlist_spin);
691296417Sdim
692296417Sdim	if (txd != NULL) {
693309124Sdim		KASSERT(txd->m == NULL && txd->refs == 0 &&
694309124Sdim		    (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
695309124Sdim		txd->flags &= ~HN_TXD_FLAG_ONLIST;
696309124Sdim		txd->refs = 1;
697309124Sdim	}
698309124Sdim	return txd;
699309124Sdim}
700309124Sdim
701309124Sdimstatic __inline void
702309124Sdimhn_txdesc_hold(struct hn_txdesc *txd)
703309124Sdim{
704309124Sdim
705309124Sdim	/* 0->1 transition will never work */
706309124Sdim	KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
707309124Sdim	atomic_add_int(&txd->refs, 1);
708309124Sdim}
709309124Sdim
710309124Sdim/*
711309124Sdim * Send completion processing
712296417Sdim *
713284236Sdim * Note:  It looks like offset 0 of buf is reserved to hold the softc
714284236Sdim * pointer.  The sc pointer is not currently needed in this function, and
715284236Sdim * it is not presently populated by the TX function.
716284236Sdim */
717284236Sdimvoid
718284236Sdimnetvsc_xmit_completion(void *context)
719284236Sdim{
720296417Sdim	netvsc_packet *packet = context;
721283625Sdim	struct hn_txdesc *txd;
722283625Sdim	struct hn_softc *sc;
723284236Sdim
724284236Sdim	txd = (struct hn_txdesc *)(uintptr_t)
725284236Sdim	    packet->compl.send.send_completion_tid;
726284236Sdim
727284236Sdim	sc = txd->sc;
728284236Sdim	sc->hn_txeof = 1;
729284236Sdim	hn_txdesc_put(sc, txd);
730284236Sdim}
731284236Sdim
732284236Sdimvoid
733284236Sdimnetvsc_channel_rollup(struct hv_device *device_ctx)
734284236Sdim{
735284236Sdim	struct hn_softc *sc = device_get_softc(device_ctx->device);
736284236Sdim	struct ifnet *ifp;
737284236Sdim
738284236Sdim	if (!sc->hn_txeof)
739284236Sdim		return;
740284236Sdim
741283625Sdim	sc->hn_txeof = 0;
742284236Sdim	ifp = sc->hn_ifp;
743284236Sdim	NV_LOCK(sc);
744284236Sdim	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
745284236Sdim	hn_start_locked(ifp);
746284236Sdim	NV_UNLOCK(sc);
747284236Sdim}
748284236Sdim
/*
 * Start a transmit of one or more packets.
 *
 * Dequeues mbufs from the interface send queue and hands them to the
 * host: each packet gets an RNDIS packet message built in front of it,
 * plus optional per-packet-info blobs (VLAN tag, checksum offload, or
 * TSO).  Small packets are copied into a "chimney" send-buffer section;
 * larger ones are DMA-mapped and described with page buffers.
 *
 * Caller must hold NV_LOCK (or otherwise own the TX path); see hn_start().
 */
static void
hn_start_locked(struct ifnet *ifp)
{
	hn_softc_t *sc = ifp->if_softc;
	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
	netvsc_dev *net_dev = sc->net_dev;
	netvsc_packet *packet;
	struct mbuf *m_head, *m;
	struct ether_vlan_header *eh;
	rndis_msg *rndis_mesg;
	rndis_packet *rndis_pkt;
	rndis_per_packet_info *rppi;
	ndis_8021q_info *rppi_vlan_info;
	rndis_tcp_ip_csum_info *csum_info;
	rndis_tcp_tso_info *tso_info;
	int ether_len;
	uint32_t rndis_msg_size = 0;
	uint32_t trans_proto_type;
	uint32_t send_buf_section_idx =
	    NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;

	/* Only transmit when running and not flow-controlled (OACTIVE). */
	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING)
		return;

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
		int error, nsegs, i, send_failed = 0;
		struct hn_txdesc *txd;

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
		if (m_head == NULL)
			break;

		/*
		 * Out of TX descriptors: requeue the mbuf, assert OACTIVE
		 * and wait for netvsc_channel_rollup() to restart us.
		 */
		txd = hn_txdesc_get(sc);
		if (txd == NULL) {
			sc->hn_no_txdescs++;
			IF_PREPEND(&ifp->if_snd, m_head);
			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
			break;
		}

		packet = &txd->netvsc_pkt;
		/* XXX not necessary */
		memset(packet, 0, sizeof(*packet));

		packet->is_data_pkt = TRUE;

		/* Initialize it from the mbuf */
		packet->tot_data_buf_len = m_head->m_pkthdr.len;

		/*
		 * Build the RNDIS packet message in the per-descriptor
		 * preallocated buffer (txd->rndis_msg); per-packet-info
		 * blobs, if any, are appended after the rndis_packet
		 * header and msg_len is finalized at pre_send.
		 */
		rndis_mesg = txd->rndis_msg;
		/* XXX not necessary */
		memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN);
		rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG;

		rndis_pkt = &rndis_mesg->msg.packet;
		rndis_pkt->data_offset = sizeof(rndis_packet);
		rndis_pkt->data_length = packet->tot_data_buf_len;
		rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet);

		rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet);

		/*
		 * If the Hyper-V infrastructure needs to embed a VLAN tag,
		 * initialize netvsc_packet and rppi struct values as needed.
		 */
		if (m_head->m_flags & M_VLANTAG) {
			/*
			 * Set up some additional fields so the Hyper-V
			 * infrastructure will stuff the VLAN tag into the
			 * frame.
			 */
			rndis_msg_size += RNDIS_VLAN_PPI_SIZE;

			rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE,
			    ieee_8021q_info);

			/* VLAN info immediately follows rppi struct */
			rppi_vlan_info = (ndis_8021q_info *)((char*)rppi +
			    rppi->per_packet_info_offset);
			/* FreeBSD does not support CFI or priority */
			rppi_vlan_info->u1.s1.vlan_id =
			    m_head->m_pkthdr.ether_vtag & 0xfff;
		}

		/* Only check the flags for outbound and ignore the ones for inbound */
		if (0 == (m_head->m_pkthdr.csum_flags & HV_CSUM_FOR_OUTBOUND)) {
			goto pre_send;
		}

		/* Compute the L2 header length (VLAN-encapsulated or not). */
		eh = mtod(m_head, struct ether_vlan_header*);
		if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
			ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
		} else {
			ether_len = ETHER_HDR_LEN;
		}

		trans_proto_type = get_transport_proto_type(m_head);
		if (TRANSPORT_TYPE_NOT_IP == trans_proto_type) {
			goto pre_send;
		}

		/*
		 * TSO packet needless to setup the send side checksum
		 * offload.
		 */
		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
			goto do_tso;
		}

		/* setup checksum offload */
		rndis_msg_size += RNDIS_CSUM_PPI_SIZE;
		rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE,
		    tcpip_chksum_info);
		/* Checksum info immediately follows the rppi struct. */
		csum_info = (rndis_tcp_ip_csum_info *)((char*)rppi +
		    rppi->per_packet_info_offset);

		/* High 16 bits of trans_proto_type carry the IP version. */
		if (trans_proto_type & (TYPE_IPV4 << 16)) {
			csum_info->xmit.is_ipv4 = 1;
		} else {
			csum_info->xmit.is_ipv6 = 1;
		}

		if (trans_proto_type & TYPE_TCP) {
			csum_info->xmit.tcp_csum = 1;
			csum_info->xmit.tcp_header_offset = 0;
		} else if (trans_proto_type & TYPE_UDP) {
			csum_info->xmit.udp_csum = 1;
		}

		goto pre_send;

do_tso:
		/* setup TCP segmentation offload */
		rndis_msg_size += RNDIS_TSO_PPI_SIZE;
		rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE,
		    tcp_large_send_info);

		tso_info = (rndis_tcp_tso_info *)((char *)rppi +
		    rppi->per_packet_info_offset);
		tso_info->lso_v2_xmit.type =
		    RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;

#ifdef INET
		if (trans_proto_type & (TYPE_IPV4 << 16)) {
			struct ip *ip =
			    (struct ip *)(m_head->m_data + ether_len);
			unsigned long iph_len = ip->ip_hl << 2;
			struct tcphdr *th =
			    (struct tcphdr *)((caddr_t)ip + iph_len);

			tso_info->lso_v2_xmit.ip_version =
			    RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
			/*
			 * Host recomputes length/checksums for TSO; clear
			 * ip_len/ip_sum and seed the TCP pseudo-header sum.
			 */
			ip->ip_len = 0;
			ip->ip_sum = 0;

			th->th_sum = in_pseudo(ip->ip_src.s_addr,
			    ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP));
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			struct ip6_hdr *ip6 =
			    (struct ip6_hdr *)(m_head->m_data + ether_len);
			struct tcphdr *th = (struct tcphdr *)(ip6 + 1);

			tso_info->lso_v2_xmit.ip_version =
			    RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
			ip6->ip6_plen = 0;
			th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
		}
#endif
		tso_info->lso_v2_xmit.tcp_header_offset = 0;
		tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz;

pre_send:
		/* Finalize total length: RNDIS header(s) + payload. */
		rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size;
		packet->tot_data_buf_len = rndis_mesg->msg_len;

		/*
		 * Chimney send: small packets are copied straight into a
		 * preallocated host send-buffer section, avoiding DMA
		 * mapping entirely.
		 */
		if (packet->tot_data_buf_len < sc->hn_tx_chimney_size) {
			send_buf_section_idx =
			    hv_nv_get_next_send_section(net_dev);
			if (send_buf_section_idx !=
			    NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) {
				char *dest = ((char *)net_dev->send_buf +
				    send_buf_section_idx *
				    net_dev->send_section_size);

				/* RNDIS message first, then the mbuf data. */
				memcpy(dest, rndis_mesg, rndis_msg_size);
				dest += rndis_msg_size;
				for (m = m_head; m != NULL; m = m->m_next) {
					if (m->m_len) {
						memcpy(dest,
						    (void *)mtod(m, vm_offset_t),
						    m->m_len);
						dest += m->m_len;
					}
				}

				packet->send_buf_section_idx =
				    send_buf_section_idx;
				packet->send_buf_section_size =
				    packet->tot_data_buf_len;
				packet->page_buf_count = 0;
				sc->hn_tx_chimney++;
				goto do_send;
			}
		}

		/* DMA path: map the mbuf chain (may defragment m_head). */
		error = hn_txdesc_dmamap_load(sc, txd, &m_head, segs, &nsegs);
		if (error) {
			int freed;

			/*
			 * This mbuf is not linked w/ the txd yet, so free
			 * it now.
			 */
			m_freem(m_head);
			freed = hn_txdesc_put(sc, txd);
			KASSERT(freed != 0,
			    ("fail to free txd upon txdma error"));

			sc->hn_txdma_failed++;
			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
			continue;
		}

		packet->page_buf_count = nsegs +
		    HV_RF_NUM_TX_RESERVED_PAGE_BUFS;

		/* Page buffer 0 describes the RNDIS message itself. */
		packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr);
		packet->page_buffers[0].offset =
		    txd->rndis_msg_paddr & PAGE_MASK;
		packet->page_buffers[0].length = rndis_msg_size;

		/*
		 * Fill the page buffers with mbuf info starting at index
		 * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
		 */
		for (i = 0; i < nsegs; ++i) {
			hv_vmbus_page_buffer *pb = &packet->page_buffers[
			    i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS];

			pb->pfn = atop(segs[i].ds_addr);
			pb->offset = segs[i].ds_addr & PAGE_MASK;
			pb->length = segs[i].ds_len;
		}

		packet->send_buf_section_idx =
		    NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
		packet->send_buf_section_size = 0;

do_send:
		txd->m = m_head;

		/* Set the completion routine */
		packet->compl.send.on_send_completion = netvsc_xmit_completion;
		packet->compl.send.send_completion_context = packet;
		packet->compl.send.send_completion_tid =
		    (uint64_t)(uintptr_t)txd;

again:
		/*
		 * Make sure that txd is not freed before ETHER_BPF_MTAP.
		 */
		hn_txdesc_hold(txd);
		error = hv_nv_on_send(device_ctx, packet);
		if (!error) {
			ETHER_BPF_MTAP(ifp, m_head);
			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
		}
		/* Drop the reference taken above (completion holds its own). */
		hn_txdesc_put(sc, txd);

		if (__predict_false(error)) {
			int freed;

			/*
			 * This should "really rarely" happen.
			 *
			 * XXX Too many RX to be acked or too many sideband
			 * commands to run?  Ask netvsc_channel_rollup()
			 * to kick start later.
			 */
			sc->hn_txeof = 1;
			if (!send_failed) {
				sc->hn_send_failed++;
				send_failed = 1;
				/*
				 * Try sending again after set hn_txeof;
				 * in case that we missed the last
				 * netvsc_channel_rollup().
				 */
				goto again;
			}
			if_printf(ifp, "send failed\n");

			/*
			 * This mbuf will be prepended, don't free it
			 * in hn_txdesc_put(); only unload it from the
			 * DMA map in hn_txdesc_put(), if it was loaded.
			 */
			txd->m = NULL;
			freed = hn_txdesc_put(sc, txd);
			KASSERT(freed != 0,
			    ("fail to free txd upon send error"));

			sc->hn_send_failed++;
			IF_PREPEND(&ifp->if_snd, m_head);
			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
			break;
		}
	}
}
1077
1078/*
1079 * Link up/down notification
1080 */
1081void
1082netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status)
1083{
1084	hn_softc_t *sc = device_get_softc(device_obj->device);
1085
1086	if (sc == NULL) {
1087		return;
1088	}
1089
1090	if (status == 1) {
1091		sc->hn_carrier = 1;
1092	} else {
1093		sc->hn_carrier = 0;
1094	}
1095}
1096
1097/*
1098 * Append the specified data to the indicated mbuf chain,
1099 * Extend the mbuf chain if the new data does not fit in
1100 * existing space.
1101 *
1102 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1103 * There should be an equivalent in the kernel mbuf code,
1104 * but there does not appear to be one yet.
1105 *
1106 * Differs from m_append() in that additional mbufs are
1107 * allocated with cluster size MJUMPAGESIZE, and filled
1108 * accordingly.
1109 *
1110 * Return 1 if able to complete the job; otherwise 0.
1111 */
1112static int
1113hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1114{
1115	struct mbuf *m, *n;
1116	int remainder, space;
1117
1118	for (m = m0; m->m_next != NULL; m = m->m_next)
1119		;
1120	remainder = len;
1121	space = M_TRAILINGSPACE(m);
1122	if (space > 0) {
1123		/*
1124		 * Copy into available space.
1125		 */
1126		if (space > remainder)
1127			space = remainder;
1128		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1129		m->m_len += space;
1130		cp += space;
1131		remainder -= space;
1132	}
1133	while (remainder > 0) {
1134		/*
1135		 * Allocate a new mbuf; could check space
1136		 * and allocate a cluster instead.
1137		 */
1138		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
1139		if (n == NULL)
1140			break;
1141		n->m_len = min(MJUMPAGESIZE, remainder);
1142		bcopy(cp, mtod(n, caddr_t), n->m_len);
1143		cp += n->m_len;
1144		remainder -= n->m_len;
1145		m->m_next = n;
1146		m = n;
1147	}
1148	if (m0->m_flags & M_PKTHDR)
1149		m0->m_pkthdr.len += len - remainder;
1150
1151	return (remainder == 0);
1152}
1153
1154
1155/*
1156 * Called when we receive a data packet from the "wire" on the
1157 * specified device
1158 *
1159 * Note:  This is no longer used as a callback
1160 */
1161int
1162netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
1163    rndis_tcp_ip_csum_info *csum_info)
1164{
1165	hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device);
1166	struct mbuf *m_new;
1167	struct ifnet *ifp;
1168	device_t dev = device_ctx->device;
1169	int size, do_lro = 0;
1170
1171	if (sc == NULL) {
1172		return (0); /* TODO: KYS how can this be! */
1173	}
1174
1175	ifp = sc->hn_ifp;
1176
1177	ifp = sc->arpcom.ac_ifp;
1178
1179	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1180		return (0);
1181	}
1182
1183	/*
1184	 * Bail out if packet contains more data than configured MTU.
1185	 */
1186	if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) {
1187		return (0);
1188	} else if (packet->tot_data_buf_len <= MHLEN) {
1189		m_new = m_gethdr(M_NOWAIT, MT_DATA);
1190		if (m_new == NULL)
1191			return (0);
1192		memcpy(mtod(m_new, void *), packet->data,
1193		    packet->tot_data_buf_len);
1194		m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len;
1195		sc->hn_small_pkts++;
1196	} else {
1197		/*
1198		 * Get an mbuf with a cluster.  For packets 2K or less,
1199		 * get a standard 2K cluster.  For anything larger, get a
1200		 * 4K cluster.  Any buffers larger than 4K can cause problems
1201		 * if looped around to the Hyper-V TX channel, so avoid them.
1202		 */
1203		size = MCLBYTES;
1204		if (packet->tot_data_buf_len > MCLBYTES) {
1205			/* 4096 */
1206			size = MJUMPAGESIZE;
1207		}
1208
1209		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
1210		if (m_new == NULL) {
1211			device_printf(dev, "alloc mbuf failed.\n");
1212			return (0);
1213		}
1214
1215		hv_m_append(m_new, packet->tot_data_buf_len, packet->data);
1216	}
1217	m_new->m_pkthdr.rcvif = ifp;
1218
1219	/* receive side checksum offload */
1220	if (NULL != csum_info) {
1221		/* IP csum offload */
1222		if (csum_info->receive.ip_csum_succeeded) {
1223			m_new->m_pkthdr.csum_flags |=
1224			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
1225			sc->hn_csum_ip++;
1226		}
1227
1228		/* TCP csum offload */
1229		if (csum_info->receive.tcp_csum_succeeded) {
1230			m_new->m_pkthdr.csum_flags |=
1231			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1232			m_new->m_pkthdr.csum_data = 0xffff;
1233			sc->hn_csum_tcp++;
1234		}
1235
1236		if (csum_info->receive.ip_csum_succeeded &&
1237		    csum_info->receive.tcp_csum_succeeded)
1238			do_lro = 1;
1239	} else {
1240		const struct ether_header *eh;
1241		uint16_t etype;
1242		int hoff;
1243
1244		hoff = sizeof(*eh);
1245		if (m_new->m_len < hoff)
1246			goto skip;
1247		eh = mtod(m_new, struct ether_header *);
1248		etype = ntohs(eh->ether_type);
1249		if (etype == ETHERTYPE_VLAN) {
1250			const struct ether_vlan_header *evl;
1251
1252			hoff = sizeof(*evl);
1253			if (m_new->m_len < hoff)
1254				goto skip;
1255			evl = mtod(m_new, struct ether_vlan_header *);
1256			etype = ntohs(evl->evl_proto);
1257		}
1258
1259		if (etype == ETHERTYPE_IP) {
1260			int pr;
1261
1262			pr = hn_check_iplen(m_new, hoff);
1263			if (pr == IPPROTO_TCP) {
1264				if (sc->hn_trust_hosttcp) {
1265					sc->hn_csum_trusted++;
1266					m_new->m_pkthdr.csum_flags |=
1267					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
1268					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1269					m_new->m_pkthdr.csum_data = 0xffff;
1270				}
1271				/* Rely on SW csum verification though... */
1272				do_lro = 1;
1273			}
1274		}
1275	}
1276skip:
1277	if ((packet->vlan_tci != 0) &&
1278	    (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) {
1279		m_new->m_pkthdr.ether_vtag = packet->vlan_tci;
1280		m_new->m_flags |= M_VLANTAG;
1281	}
1282
1283	/*
1284	 * Note:  Moved RX completion back to hv_nv_on_receive() so all
1285	 * messages (not just data messages) will trigger a response.
1286	 */
1287
1288	ifp->if_ipackets++;
1289
1290	if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
1291#if defined(INET) || defined(INET6)
1292		struct lro_ctrl *lro = &sc->hn_lro;
1293
1294		if (lro->lro_cnt) {
1295			sc->hn_lro_tried++;
1296			if (tcp_lro_rx(lro, m_new, 0) == 0) {
1297				/* DONE! */
1298				return 0;
1299			}
1300		}
1301#endif
1302	}
1303
1304	/* We're not holding the lock here, so don't release it */
1305	(*ifp->if_input)(ifp, m_new);
1306
1307	return (0);
1308}
1309
/*
 * RX rollup: flush every pending LRO aggregation accumulated during
 * the receive batch.
 */
void
netvsc_recv_rollup(struct hv_device *device_ctx)
{
#if defined(INET) || defined(INET6)
	hn_softc_t *sc = device_get_softc(device_ctx->device);
	struct lro_ctrl *lro = &sc->hn_lro;
	struct lro_entry *queued;

	for (;;) {
		queued = SLIST_FIRST(&lro->lro_active);
		if (queued == NULL)
			break;
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif
}
1324
1325/*
1326 * Rules for using sc->temp_unusable:
1327 * 1.  sc->temp_unusable can only be read or written while holding NV_LOCK()
1328 * 2.  code reading sc->temp_unusable under NV_LOCK(), and finding
1329 *     sc->temp_unusable set, must release NV_LOCK() and exit
1330 * 3.  to retain exclusive control of the interface,
1331 *     sc->temp_unusable must be set by code before releasing NV_LOCK()
1332 * 4.  only code setting sc->temp_unusable can clear sc->temp_unusable
1333 * 5.  code setting sc->temp_unusable must eventually clear sc->temp_unusable
1334 */
1335
1336/*
1337 * Standard ioctl entry point.  Called when the user wants to configure
1338 * the interface.
1339 */
static int
hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	hn_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
#ifdef INET
	struct ifaddr *ifa = (struct ifaddr *)data;
#endif
	netvsc_device_info device_info;
	struct hv_device *hn_dev;
	int mask, error = 0;
	/* Poll budget for acquiring temp_unusable: 500 * 5ms = ~2.5s. */
	int retry_cnt = 500;

	switch(cmd) {

	case SIOCSIFADDR:
#ifdef INET
		if (ifa->ifa_addr->sa_family == AF_INET) {
			ifp->if_flags |= IFF_UP;
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
				hn_ifinit(sc);
			arp_ifinit(ifp, ifa);
		} else
#endif
		error = ether_ioctl(ifp, cmd, data);
		break;
	case SIOCSIFMTU:
		hn_dev = vmbus_get_devctx(sc->hn_dev);

		/* Check MTU value change */
		if (ifp->if_mtu == ifr->ifr_mtu)
			break;

		if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
			error = EINVAL;
			break;
		}

		/* Obtain and record requested MTU */
		ifp->if_mtu = ifr->ifr_mtu;
		/*
		 * Make sure that LRO high watermark is still valid,
		 * after MTU change (the 2*MTU limit).
		 */
		if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat))
			hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp));

		/*
		 * Claim exclusive control of the interface: spin until
		 * temp_unusable can be set (retry_cnt = -1 signals success;
		 * see the temp_unusable rules documented above).
		 */
		do {
			NV_LOCK(sc);
			if (!sc->temp_unusable) {
				sc->temp_unusable = TRUE;
				retry_cnt = -1;
			}
			NV_UNLOCK(sc);
			if (retry_cnt > 0) {
				retry_cnt--;
				DELAY(5 * 1000);
			}
		} while (retry_cnt > 0);

		/* retry_cnt == 0 means the budget expired without success. */
		if (retry_cnt == 0) {
			error = EINVAL;
			break;
		}

		/* We must remove and add back the device to cause the new
		 * MTU to take effect.  This includes tearing down, but not
		 * deleting the channel, then bringing it back up.
		 */
		error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL);
		if (error) {
			NV_LOCK(sc);
			sc->temp_unusable = FALSE;
			NV_UNLOCK(sc);
			break;
		}
		error = hv_rf_on_device_add(hn_dev, &device_info);
		if (error) {
			NV_LOCK(sc);
			sc->temp_unusable = FALSE;
			NV_UNLOCK(sc);
			break;
		}

		/* Re-query the chimney limit; the device was re-added. */
		sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
		if (sc->hn_tx_chimney_size > sc->hn_tx_chimney_max)
			sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
		hn_ifinit_locked(sc);

		NV_LOCK(sc);
		sc->temp_unusable = FALSE;
		NV_UNLOCK(sc);
		break;
	case SIOCSIFFLAGS:
		/* Same temp_unusable acquisition dance as SIOCSIFMTU. */
		do {
                       NV_LOCK(sc);
                       if (!sc->temp_unusable) {
                               sc->temp_unusable = TRUE;
                               retry_cnt = -1;
                       }
                       NV_UNLOCK(sc);
                       if (retry_cnt > 0) {
                      	        retry_cnt--;
                        	DELAY(5 * 1000);
                       }
                } while (retry_cnt > 0);

                if (retry_cnt == 0) {
                       error = EINVAL;
                       break;
                }

		if (ifp->if_flags & IFF_UP) {
			/*
			 * If only the state of the PROMISC flag changed,
			 * then just use the 'set promisc mode' command
			 * instead of reinitializing the entire NIC. Doing
			 * a full re-init means reloading the firmware and
			 * waiting for it to start up, which may take a
			 * second or two.
			 */
#ifdef notyet
			/* Fixme:  Promiscuous mode? */
			if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
			    ifp->if_flags & IFF_PROMISC &&
			    !(sc->hn_if_flags & IFF_PROMISC)) {
				/* do something here for Hyper-V */
			} else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
			    !(ifp->if_flags & IFF_PROMISC) &&
			    sc->hn_if_flags & IFF_PROMISC) {
				/* do something here for Hyper-V */
			} else
#endif
				hn_ifinit_locked(sc);
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				hn_stop(sc);
			}
		}
		NV_LOCK(sc);
		sc->temp_unusable = FALSE;
		NV_UNLOCK(sc);
		sc->hn_if_flags = ifp->if_flags;
		error = 0;
		break;
	case SIOCSIFCAP:
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TXCSUM;
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				/*
				 * Only enable UDP checksum offloading on
				 * Windows Server 2012R2 or later releases.
				 */
				if (hv_vmbus_protocal_version >=
				    HV_VMBUS_VERSION_WIN8_1) {
					ifp->if_hwassist |=
					    (CSUM_TCP | CSUM_UDP);
				} else {
					ifp->if_hwassist |= CSUM_TCP;
				}
			}
		}

		if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
			}
		}
		if (mask & IFCAP_LRO)
			ifp->if_capenable ^= IFCAP_LRO;

		if (mask & IFCAP_TSO4) {
			ifp->if_capenable ^= IFCAP_TSO4;
			ifp->if_hwassist ^= CSUM_IP_TSO;
		}

		if (mask & IFCAP_TSO6) {
			ifp->if_capenable ^= IFCAP_TSO6;
			ifp->if_hwassist ^= CSUM_IP6_TSO;
		}

		error = 0;
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
#ifdef notyet
		/* Fixme:  Multicast mode? */
		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
			NV_LOCK(sc);
			netvsc_setmulti(sc);
			NV_UNLOCK(sc);
			error = 0;
		}
#endif
		/* NOTE(review): falls through to EINVAL even when the
		 * notyet path sets error = 0 -- confirm intent if the
		 * notyet code is ever enabled. */
		error = EINVAL;
		break;
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
		break;
	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}

	return (error);
}
1553
1554/*
1555 *
1556 */
1557static void
1558hn_stop(hn_softc_t *sc)
1559{
1560	struct ifnet *ifp;
1561	int ret;
1562	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
1563
1564	ifp = sc->hn_ifp;
1565
1566	if (bootverbose)
1567		printf(" Closing Device ...\n");
1568
1569	ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
1570	if_link_state_change(ifp, LINK_STATE_DOWN);
1571	sc->hn_initdone = 0;
1572
1573	ret = hv_rf_on_close(device_ctx);
1574}
1575
1576/*
1577 * FreeBSD transmit entry point
1578 */
1579static void
1580hn_start(struct ifnet *ifp)
1581{
1582	hn_softc_t *sc;
1583
1584	sc = ifp->if_softc;
1585	NV_LOCK(sc);
1586	if (sc->temp_unusable) {
1587		NV_UNLOCK(sc);
1588		return;
1589	}
1590	hn_start_locked(ifp);
1591	NV_UNLOCK(sc);
1592}
1593
1594/*
1595 *
1596 */
1597static void
1598hn_ifinit_locked(hn_softc_t *sc)
1599{
1600	struct ifnet *ifp;
1601	struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
1602	int ret;
1603
1604	ifp = sc->hn_ifp;
1605
1606	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1607		return;
1608	}
1609
1610	hv_promisc_mode = 1;
1611
1612	ret = hv_rf_on_open(device_ctx);
1613	if (ret != 0) {
1614		return;
1615	} else {
1616		sc->hn_initdone = 1;
1617	}
1618	ifp->if_drv_flags |= IFF_DRV_RUNNING;
1619	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
1620	if_link_state_change(ifp, LINK_STATE_UP);
1621}
1622
/*
 * if_init entry point: take exclusive control of the interface via
 * temp_unusable (see the rules documented above hn_ioctl), run the
 * locked init, then release control.  Bails out if another thread
 * already holds temp_unusable.
 */
static void
hn_ifinit(void *xsc)
{
	hn_softc_t *sc = xsc;

	NV_LOCK(sc);
	if (sc->temp_unusable) {
		/* Someone else is reconfiguring; give up (rule 2). */
		NV_UNLOCK(sc);
		return;
	}
	sc->temp_unusable = TRUE;
	NV_UNLOCK(sc);

	hn_ifinit_locked(sc);

	NV_LOCK(sc);
	sc->temp_unusable = FALSE;
	NV_UNLOCK(sc);
}
1645
1646#ifdef LATER
1647/*
1648 *
1649 */
1650static void
1651hn_watchdog(struct ifnet *ifp)
1652{
1653	hn_softc_t *sc;
1654	sc = ifp->if_softc;
1655
1656	printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit);
1657	hn_ifinit(sc);    /*???*/
1658	ifp->if_oerrors++;
1659}
1660#endif
1661
1662#ifdef HN_LRO_HIWAT
1663static int
1664hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS)
1665{
1666	struct hn_softc *sc = arg1;
1667	int hiwat, error;
1668
1669	hiwat = sc->hn_lro_hiwat;
1670	error = sysctl_handle_int(oidp, &hiwat, 0, req);
1671	if (error || req->newptr == NULL)
1672		return error;
1673
1674	if (!HN_LRO_HIWAT_ISVALID(sc, hiwat))
1675		return EINVAL;
1676
1677	if (sc->hn_lro_hiwat != hiwat)
1678		hn_set_lro_hiwat(sc, hiwat);
1679	return 0;
1680}
1681#endif	/* HN_LRO_HIWAT */
1682
1683static int
1684hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS)
1685{
1686	struct hn_softc *sc = arg1;
1687	int chimney_size, error;
1688
1689	chimney_size = sc->hn_tx_chimney_size;
1690	error = sysctl_handle_int(oidp, &chimney_size, 0, req);
1691	if (error || req->newptr == NULL)
1692		return error;
1693
1694	if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0)
1695		return EINVAL;
1696
1697	if (sc->hn_tx_chimney_size != chimney_size)
1698		sc->hn_tx_chimney_size = chimney_size;
1699	return 0;
1700}
1701
/*
 * Validate an IPv4 packet whose IP header starts at byte offset hoff
 * within mbuf m.  Verifies that the full IP header (and, for TCP/UDP,
 * the full transport header) is contiguous in the first mbuf, that the
 * stated lengths are self-consistent, and that the packet is not a
 * fragment.  Returns the IP protocol number on success, or
 * IPPROTO_DONE when any check fails (callers treat that as
 * "do not trust / do not LRO").
 */
static int
hn_check_iplen(const struct mbuf *m, int hoff)
{
	const struct ip *ip;
	int len, iphlen, iplen;
	const struct tcphdr *th;
	int thoff;				/* TCP data offset */

	len = hoff + sizeof(struct ip);

	/* The packet must be at least the size of an IP header. */
	if (m->m_pkthdr.len < len)
		return IPPROTO_DONE;

	/* The fixed IP header must reside completely in the first mbuf. */
	if (m->m_len < len)
		return IPPROTO_DONE;

	ip = mtodo(m, hoff);

	/* Bound check the packet's stated IP header length. */
	iphlen = ip->ip_hl << 2;
	if (iphlen < sizeof(struct ip))		/* minimum header length */
		return IPPROTO_DONE;

	/* The full IP header must reside completely in the one mbuf. */
	if (m->m_len < hoff + iphlen)
		return IPPROTO_DONE;

	iplen = ntohs(ip->ip_len);

	/*
	 * Check that the amount of data in the buffers is as
	 * at least much as the IP header would have us expect.
	 */
	if (m->m_pkthdr.len < hoff + iplen)
		return IPPROTO_DONE;

	/*
	 * Ignore IP fragments.
	 */
	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
		return IPPROTO_DONE;

	/*
	 * The TCP/IP or UDP/IP header must be entirely contained within
	 * the first fragment of a packet.
	 */
	switch (ip->ip_p) {
	case IPPROTO_TCP:
		if (iplen < iphlen + sizeof(struct tcphdr))
			return IPPROTO_DONE;
		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
			return IPPROTO_DONE;
		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
		/* Validate the TCP data offset against the IP length. */
		thoff = th->th_off << 2;
		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
			return IPPROTO_DONE;
		if (m->m_len < hoff + iphlen + thoff)
			return IPPROTO_DONE;
		break;
	case IPPROTO_UDP:
		if (iplen < iphlen + sizeof(struct udphdr))
			return IPPROTO_DONE;
		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
			return IPPROTO_DONE;
		break;
	default:
		if (iplen < iphlen)
			return IPPROTO_DONE;
		break;
	}
	return ip->ip_p;
}
1776
1777static void
1778hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
1779{
1780	bus_addr_t *paddr = arg;
1781
1782	if (error)
1783		return;
1784
1785	KASSERT(nseg == 1, ("too many segments %d!", nseg));
1786	*paddr = segs->ds_addr;
1787}
1788
/*
 * Allocate the TX descriptor ring: two DMA tags (one for per-descriptor
 * RNDIS message buffers, one for packet data), then for each descriptor
 * an RNDIS message buffer (allocated, loaded to get its bus address)
 * and a data DMA map.  Returns 0 on success or a busdma errno.
 *
 * NOTE(review): on a mid-loop failure only the current descriptor's
 * resources are unwound here; earlier descriptors and the tags are
 * presumably reclaimed by hn_destroy_tx_ring() in the caller's error
 * path -- confirm against the attach code.
 */
static int
hn_create_tx_ring(struct hn_softc *sc)
{
	bus_dma_tag_t parent_dtag;
	int error, i;

	sc->hn_txdesc_cnt = HN_TX_DESC_CNT;
	sc->hn_txdesc = malloc(sizeof(struct hn_txdesc) * sc->hn_txdesc_cnt,
	    M_NETVSC, M_WAITOK | M_ZERO);
	SLIST_INIT(&sc->hn_txlist);
	mtx_init(&sc->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);

	parent_dtag = bus_get_dma_tag(sc->hn_dev);

	/* DMA tag for RNDIS messages. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    HN_RNDIS_MSG_ALIGN,		/* alignment */
	    HN_RNDIS_MSG_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_RNDIS_MSG_LEN,		/* maxsize */
	    1,				/* nsegments */
	    HN_RNDIS_MSG_LEN,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &sc->hn_tx_rndis_dtag);
	if (error) {
		device_printf(sc->hn_dev, "failed to create rndis dmatag\n");
		return error;
	}

	/* DMA tag for data. */
	error = bus_dma_tag_create(parent_dtag, /* parent */
	    1,				/* alignment */
	    HN_TX_DATA_BOUNDARY,	/* boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    HN_TX_DATA_MAXSIZE,		/* maxsize */
	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
	    0,				/* flags */
	    NULL,			/* lockfunc */
	    NULL,			/* lockfuncarg */
	    &sc->hn_tx_data_dtag);
	if (error) {
		device_printf(sc->hn_dev, "failed to create data dmatag\n");
		return error;
	}

	for (i = 0; i < sc->hn_txdesc_cnt; ++i) {
		struct hn_txdesc *txd = &sc->hn_txdesc[i];

		txd->sc = sc;

		/*
		 * Allocate and load RNDIS messages.
		 */
        	error = bus_dmamem_alloc(sc->hn_tx_rndis_dtag,
		    (void **)&txd->rndis_msg,
		    BUS_DMA_WAITOK | BUS_DMA_COHERENT,
		    &txd->rndis_msg_dmap);
		if (error) {
			device_printf(sc->hn_dev,
			    "failed to allocate rndis_msg, %d\n", i);
			return error;
		}

		/* Load to capture the buffer's bus address via callback. */
		error = bus_dmamap_load(sc->hn_tx_rndis_dtag,
		    txd->rndis_msg_dmap,
		    txd->rndis_msg, HN_RNDIS_MSG_LEN,
		    hn_dma_map_paddr, &txd->rndis_msg_paddr,
		    BUS_DMA_NOWAIT);
		if (error) {
			device_printf(sc->hn_dev,
			    "failed to load rndis_msg, %d\n", i);
			bus_dmamem_free(sc->hn_tx_rndis_dtag,
			    txd->rndis_msg, txd->rndis_msg_dmap);
			return error;
		}

		/* DMA map for TX data. */
		error = bus_dmamap_create(sc->hn_tx_data_dtag, 0,
		    &txd->data_dmap);
		if (error) {
			device_printf(sc->hn_dev,
			    "failed to allocate tx data dmamap\n");
			bus_dmamap_unload(sc->hn_tx_rndis_dtag,
			    txd->rndis_msg_dmap);
			bus_dmamem_free(sc->hn_tx_rndis_dtag,
			    txd->rndis_msg, txd->rndis_msg_dmap);
			return error;
		}

		/* All set, put it to list */
		txd->flags |= HN_TXD_FLAG_ONLIST;
		SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
	}
	sc->hn_txdesc_avail = sc->hn_txdesc_cnt;

	return 0;
}
1893
1894static void
1895hn_destroy_tx_ring(struct hn_softc *sc)
1896{
1897	struct hn_txdesc *txd;
1898
1899	while ((txd = SLIST_FIRST(&sc->hn_txlist)) != NULL) {
1900		KASSERT(txd->m == NULL, ("still has mbuf installed"));
1901		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1902		    ("still dma mapped"));
1903		SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
1904
1905		bus_dmamap_unload(sc->hn_tx_rndis_dtag,
1906		    txd->rndis_msg_dmap);
1907		bus_dmamem_free(sc->hn_tx_rndis_dtag,
1908		    txd->rndis_msg, txd->rndis_msg_dmap);
1909
1910		bus_dmamap_destroy(sc->hn_tx_data_dtag, txd->data_dmap);
1911	}
1912
1913	if (sc->hn_tx_data_dtag != NULL)
1914		bus_dma_tag_destroy(sc->hn_tx_data_dtag);
1915	if (sc->hn_tx_rndis_dtag != NULL)
1916		bus_dma_tag_destroy(sc->hn_tx_rndis_dtag);
1917	free(sc->hn_txdesc, M_NETVSC);
1918	mtx_destroy(&sc->hn_txlist_spin);
1919}
1920
1921static device_method_t netvsc_methods[] = {
1922        /* Device interface */
1923        DEVMETHOD(device_probe,         netvsc_probe),
1924        DEVMETHOD(device_attach,        netvsc_attach),
1925        DEVMETHOD(device_detach,        netvsc_detach),
1926        DEVMETHOD(device_shutdown,      netvsc_shutdown),
1927
1928        { 0, 0 }
1929};
1930
1931static driver_t netvsc_driver = {
1932        NETVSC_DEVNAME,
1933        netvsc_methods,
1934        sizeof(hn_softc_t)
1935};
1936
static devclass_t netvsc_devclass;

/* Register the "hn" driver on the vmbus; it depends on the vmbus module. */
DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
1942