1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2020 Microsoft Corp.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD$");
31
32#include <sys/param.h>
33#include <sys/bus.h>
34#include <sys/domain.h>
35#include <sys/lock.h>
36#include <sys/kernel.h>
37#include <sys/types.h>
38#include <sys/malloc.h>
39#include <sys/module.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/protosw.h>
43#include <sys/socket.h>
44#include <sys/sysctl.h>
45#include <sys/sysproto.h>
46#include <sys/systm.h>
47#include <sys/sockbuf.h>
48#include <sys/sx.h>
49#include <sys/uio.h>
50
51#include <net/vnet.h>
52
53#include <dev/hyperv/vmbus/vmbus_reg.h>
54
55#include "hv_sock.h"
56
57#define HVSOCK_DBG_NONE			0x0
58#define HVSOCK_DBG_INFO			0x1
59#define HVSOCK_DBG_ERR			0x2
60#define HVSOCK_DBG_VERBOSE		0x3
61
62
63SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");
64
65static int hvs_dbg_level;
66SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
67    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");
68
69
70#define HVSOCK_DBG(level, ...) do {					\
71	if (hvs_dbg_level >= (level))					\
72		printf(__VA_ARGS__);					\
73	} while (0)
74
75MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
76
77/* The MTU is 16KB per host side's design */
78#define HVSOCK_MTU_SIZE		(1024 * 16)
79#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))
80
81#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))
82
83#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
84					 roundup2(payload_len, 8) + \
85					 sizeof(uint64_t))
86
87
88static struct domain		hv_socket_domain;
89
90/*
91 * HyperV Transport sockets
92 */
93static struct pr_usrreqs	hvs_trans_usrreqs = {
94	.pru_attach =		hvs_trans_attach,
95	.pru_bind =		hvs_trans_bind,
96	.pru_listen =		hvs_trans_listen,
97	.pru_accept =		hvs_trans_accept,
98	.pru_connect =		hvs_trans_connect,
99	.pru_peeraddr =		hvs_trans_peeraddr,
100	.pru_sockaddr =		hvs_trans_sockaddr,
101	.pru_soreceive =	hvs_trans_soreceive,
102	.pru_sosend =		hvs_trans_sosend,
103	.pru_disconnect =	hvs_trans_disconnect,
104	.pru_close =		hvs_trans_close,
105	.pru_detach =		hvs_trans_detach,
106	.pru_shutdown =		hvs_trans_shutdown,
107	.pru_abort =		hvs_trans_abort,
108};
109
110/*
111 * Definitions of protocols supported in HyperV socket domain
112 */
113static struct protosw		hv_socket_protosw[] = {
114{
115	.pr_type =		SOCK_STREAM,
116	.pr_domain =		&hv_socket_domain,
117	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
118	.pr_flags =		PR_CONNREQUIRED,
119	.pr_init =		hvs_trans_init,
120	.pr_usrreqs =		&hvs_trans_usrreqs,
121},
122};
123
124static struct domain		hv_socket_domain = {
125	.dom_family =		AF_HYPERV,
126	.dom_name =		"hyperv",
127	.dom_protosw =		hv_socket_protosw,
128	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
129};
130
131VNET_DOMAIN_SET(hv_socket_);
132
133#define MAX_PORT			((uint32_t)0xFFFFFFFF)
134#define MIN_PORT			((uint32_t)0x0)
135
136/* 00000000-facb-11e6-bd58-64006a7986d3 */
137static const struct hyperv_guid srv_id_template = {
138	.hv_guid = {
139	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
140	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
141};
142
143static int		hvsock_br_callback(void *, int, void *);
144static uint32_t		hvsock_canread_check(struct hvs_pcb *);
145static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
146static int		hvsock_send_data(struct vmbus_channel *chan,
147    struct uio *uio, uint32_t to_write, struct sockbuf *sb);
148
149
150
151/* Globals */
152static struct sx		hvs_trans_socks_sx;
153static struct mtx		hvs_trans_socks_mtx;
154static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
155static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
156static uint32_t			previous_auto_bound_port;
157
158static void
159hvsock_print_guid(struct hyperv_guid *guid)
160{
161	unsigned char *p = (unsigned char *)guid;
162
163	HVSOCK_DBG(HVSOCK_DBG_INFO,
164	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
165	    *(unsigned int *)p,
166	    *((unsigned short *) &p[4]),
167	    *((unsigned short *) &p[6]),
168	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
169}
170
171static bool
172is_valid_srv_id(const struct hyperv_guid *id)
173{
174	return !memcmp(&id->hv_guid[4],
175	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
176}
177
/*
 * Return the port number stored in the leading bytes of a service GUID,
 * read in host byte order.
 *
 * memcpy() is used because the GUID is handled as a byte array elsewhere
 * in this file and may not be aligned for a direct unsigned int load.
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	unsigned int port;

	memcpy(&port, srv_id, sizeof(port));

	return (port);
}
183
/*
 * Store `port' into the leading bytes of a service GUID, in host byte
 * order.  The rest of the GUID is left untouched.
 *
 * memcpy() is used because the GUID is handled as a byte array elsewhere
 * in this file and may not be aligned for a direct unsigned int store.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	memcpy(srv_id, &port, sizeof(port));
}
189
190
191static void
192__hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
193{
194	struct hvs_pcb *p = NULL;
195
196	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
197
198	if (!pcb)
199		return;
200
201	if (list & HVS_LIST_BOUND) {
202		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
203			if  (p == pcb)
204				LIST_REMOVE(p, bound_next);
205	}
206
207	if (list & HVS_LIST_CONNECTED) {
208		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
209			if (p == pcb)
210				LIST_REMOVE(pcb, connected_next);
211	}
212}
213
214static void
215__hvs_remove_socket_from_list(struct socket *so, unsigned char list)
216{
217	struct hvs_pcb *pcb = so2hvspcb(so);
218
219	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
220
221	__hvs_remove_pcb_from_list(pcb, list);
222}
223
224static void
225__hvs_insert_socket_on_list(struct socket *so, unsigned char list)
226{
227	struct hvs_pcb *pcb = so2hvspcb(so);
228
229	if (list & HVS_LIST_BOUND)
230		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
231		   pcb, bound_next);
232
233	if (list & HVS_LIST_CONNECTED)
234		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
235		   pcb, connected_next);
236}
237
238void
239hvs_remove_socket_from_list(struct socket *so, unsigned char list)
240{
241	if (!so || !so->so_pcb) {
242		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
243		    "%s: socket or so_pcb is null\n", __func__);
244		return;
245	}
246
247	mtx_lock(&hvs_trans_socks_mtx);
248	__hvs_remove_socket_from_list(so, list);
249	mtx_unlock(&hvs_trans_socks_mtx);
250}
251
252static void
253hvs_insert_socket_on_list(struct socket *so, unsigned char list)
254{
255	if (!so || !so->so_pcb) {
256		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
257		    "%s: socket or so_pcb is null\n", __func__);
258		return;
259	}
260
261	mtx_lock(&hvs_trans_socks_mtx);
262	__hvs_insert_socket_on_list(so, list);
263	mtx_unlock(&hvs_trans_socks_mtx);
264}
265
266static struct socket *
267__hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
268{
269	struct hvs_pcb *p = NULL;
270
271	if (list & HVS_LIST_BOUND)
272		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
273			if (p->so != NULL &&
274			    addr->hvs_port == p->local_addr.hvs_port)
275				return p->so;
276
277	if (list & HVS_LIST_CONNECTED)
278		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
279			if (p->so != NULL &&
280			    addr->hvs_port == p->local_addr.hvs_port)
281				return p->so;
282
283	return NULL;
284}
285
286static struct socket *
287hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
288{
289	struct socket *s = NULL;
290
291	mtx_lock(&hvs_trans_socks_mtx);
292	s = __hvs_find_socket_on_list(addr, list);
293	mtx_unlock(&hvs_trans_socks_mtx);
294
295	return s;
296}
297
298static inline void
299hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
300{
301	memset(addr, 0, sizeof(*addr));
302	addr->sa_family = AF_HYPERV;
303	addr->sa_len = sizeof(*addr);
304	addr->hvs_port = port;
305}
306
/*
 * Initialize `addr' as an AF_HYPERV address whose port is the one
 * encoded in the service GUID `svr_id'.
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port;

	port = get_port_by_srv_id(svr_id);
	hvs_addr_set(addr, port);
}
312
313int
314hvs_trans_lock(void)
315{
316	sx_xlock(&hvs_trans_socks_sx);
317	return (0);
318}
319
320void
321hvs_trans_unlock(void)
322{
323	sx_xunlock(&hvs_trans_socks_sx);
324}
325
326void
327hvs_trans_init(void)
328{
329	/* Skip initialization of globals for non-default instances. */
330	if (!IS_DEFAULT_VNET(curvnet))
331		return;
332
333	if (vm_guest != VM_GUEST_HV)
334		return;
335
336	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
337	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
338
339	/* Initialize Globals */
340	previous_auto_bound_port = MAX_PORT;
341	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
342	mtx_init(&hvs_trans_socks_mtx,
343	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
344	LIST_INIT(&hvs_trans_bound_socks);
345	LIST_INIT(&hvs_trans_connected_socks);
346}
347
348/*
349 * Called in two cases:
350 * 1) When user calls socket();
351 * 2) When we accept new incoming conneciton and call sonewconn().
352 */
353int
354hvs_trans_attach(struct socket *so, int proto, struct thread *td)
355{
356	struct hvs_pcb *pcb = so2hvspcb(so);
357
358	if (vm_guest != VM_GUEST_HV)
359		return (ESOCKTNOSUPPORT);
360
361	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
362	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
363
364	if (so->so_type != SOCK_STREAM)
365		return (ESOCKTNOSUPPORT);
366
367	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
368		return (EPROTONOSUPPORT);
369
370	if (pcb != NULL)
371		return (EISCONN);
372	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
373	if (pcb == NULL)
374		return (ENOMEM);
375
376	pcb->so = so;
377	so->so_pcb = (void *)pcb;
378
379	return (0);
380}
381
382void
383hvs_trans_detach(struct socket *so)
384{
385	struct hvs_pcb *pcb;
386
387	if (vm_guest != VM_GUEST_HV)
388		return;
389
390	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
391	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
392
393	(void) hvs_trans_lock();
394	pcb = so2hvspcb(so);
395	if (pcb == NULL) {
396		hvs_trans_unlock();
397		return;
398	}
399
400	if (SOLISTENING(so)) {
401		bzero(pcb, sizeof(*pcb));
402		free(pcb, M_HVSOCK);
403	}
404
405	so->so_pcb = NULL;
406
407	hvs_trans_unlock();
408}
409
410int
411hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
412{
413	struct hvs_pcb *pcb = so2hvspcb(so);
414	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
415	int error = 0;
416
417	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
418	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
419
420	if (sa == NULL) {
421		return (EINVAL);
422	}
423
424	if (pcb == NULL) {
425		return (EINVAL);
426	}
427
428	if (sa->sa_family != AF_HYPERV) {
429		HVSOCK_DBG(HVSOCK_DBG_ERR,
430		    "%s: Not supported, sa_family is %u\n",
431		    __func__, sa->sa_family);
432		return (EAFNOSUPPORT);
433	}
434	if (sa->sa_len != sizeof(*sa)) {
435		HVSOCK_DBG(HVSOCK_DBG_ERR,
436		    "%s: Not supported, sa_len is %u\n",
437		    __func__, sa->sa_len);
438		return (EINVAL);
439	}
440
441	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
442	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
443
444	mtx_lock(&hvs_trans_socks_mtx);
445	if (__hvs_find_socket_on_list(sa,
446	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
447		error = EADDRINUSE;
448	} else {
449		/*
450		 * The address is available for us to bind.
451		 * Add socket to the bound list.
452		 */
453		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
454		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
455		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
456	}
457	mtx_unlock(&hvs_trans_socks_mtx);
458
459	return (error);
460}
461
462int
463hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
464{
465	struct hvs_pcb *pcb = so2hvspcb(so);
466	struct socket *bound_so;
467	int error;
468
469	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
470	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
471
472	if (pcb == NULL)
473		return (EINVAL);
474
475	/* Check if the address is already bound and it was by us. */
476	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
477	if (bound_so == NULL || bound_so != so) {
478		HVSOCK_DBG(HVSOCK_DBG_ERR,
479		    "%s: Address not bound or not by us.\n", __func__);
480		return (EADDRNOTAVAIL);
481	}
482
483	SOCK_LOCK(so);
484	error = solisten_proto_check(so);
485	if (error == 0)
486		solisten_proto(so, backlog);
487	SOCK_UNLOCK(so);
488
489	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
490	    "%s: HyperV Socket listen error = %d\n", __func__, error);
491	return (error);
492}
493
494int
495hvs_trans_accept(struct socket *so, struct sockaddr **nam)
496{
497	struct hvs_pcb *pcb = so2hvspcb(so);
498
499	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
500	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
501
502	if (pcb == NULL)
503		return (EINVAL);
504
505	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
506	    M_NOWAIT);
507
508	return ((*nam == NULL) ? ENOMEM : 0);
509}
510
511int
512hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
513{
514	struct hvs_pcb *pcb = so2hvspcb(so);
515	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
516	bool found_auto_bound_port = false;
517	int i, error = 0;
518
519	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
520	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
521	    __func__, raddr->hvs_port);
522
523	if (pcb == NULL)
524		return (EINVAL);
525
526	/* Verify the remote address */
527	if (raddr == NULL)
528		return (EINVAL);
529	if (raddr->sa_family != AF_HYPERV)
530		return (EAFNOSUPPORT);
531	if (raddr->sa_len != sizeof(*raddr))
532		return (EINVAL);
533
534	mtx_lock(&hvs_trans_socks_mtx);
535	if (so->so_state &
536	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
537			HVSOCK_DBG(HVSOCK_DBG_ERR,
538			    "%s: socket connect in progress\n",
539			    __func__);
540			error = EINPROGRESS;
541			goto out;
542	}
543
544	/*
545	 * Find an available port for us to auto bind the local
546	 * address.
547	 */
548	hvs_addr_set(&pcb->local_addr, 0);
549
550	for (i = previous_auto_bound_port - 1;
551	    i != previous_auto_bound_port; i --) {
552		if (i == MIN_PORT)
553			i = MAX_PORT;
554
555		pcb->local_addr.hvs_port = i;
556
557		if (__hvs_find_socket_on_list(&pcb->local_addr,
558		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
559			found_auto_bound_port = true;
560			previous_auto_bound_port = i;
561			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
562			    "%s: found local bound port is %x\n",
563			    __func__, pcb->local_addr.hvs_port);
564			break;
565		}
566	}
567
568	if (found_auto_bound_port == true) {
569		/* Found available port for auto bound, put on list */
570		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
571		/* Set VM service ID */
572		pcb->vm_srv_id = srv_id_template;
573		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
574		/* Set host service ID and remote port */
575		pcb->host_srv_id = srv_id_template;
576		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
577		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
578
579		/* Change the socket state to SS_ISCONNECTING */
580		soisconnecting(so);
581	} else {
582		HVSOCK_DBG(HVSOCK_DBG_ERR,
583		    "%s: No local port available for auto bound\n",
584		    __func__);
585		error = EADDRINUSE;
586	}
587
588	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
589	hvsock_print_guid(&pcb->vm_srv_id);
590	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
591	hvsock_print_guid(&pcb->host_srv_id);
592
593out:
594	mtx_unlock(&hvs_trans_socks_mtx);
595
596	if (found_auto_bound_port == true)
597		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
598
599	return (error);
600}
601
602int
603hvs_trans_disconnect(struct socket *so)
604{
605	struct hvs_pcb *pcb;
606
607	if (vm_guest != VM_GUEST_HV)
608		return (ESOCKTNOSUPPORT);
609
610	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
611	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
612
613	(void) hvs_trans_lock();
614	pcb = so2hvspcb(so);
615	if (pcb == NULL) {
616		hvs_trans_unlock();
617		return (EINVAL);
618	}
619
620	/* If socket is already disconnected, skip this */
621	if ((so->so_state & SS_ISDISCONNECTED) == 0)
622		soisdisconnecting(so);
623
624	hvs_trans_unlock();
625
626	return (0);
627}
628
/* Map message flags to the sblock() wait flag: no wait for MSG_DONTWAIT. */
#define SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? 0 : SBL_WAIT)

/*
 * Context handed to hvsock_br_callback() through its opaque argument.
 */
struct hvs_callback_arg {
	struct uio *uio;	/* uio the callback runs uiomove() on */
	struct sockbuf *sb;	/* unlocked around uiomove(); may be NULL */
};
634
635int
636hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
637    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
638{
639	struct hvs_pcb *pcb = so2hvspcb(so);
640	struct sockbuf *sb;
641	ssize_t orig_resid;
642	uint32_t canread, to_read;
643	int flags, error = 0;
644	struct hvs_callback_arg cbarg;
645
646	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
647	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);
648
649	if (so->so_type != SOCK_STREAM)
650		return (EINVAL);
651	if (pcb == NULL)
652		return (EINVAL);
653
654	if (flagsp != NULL)
655		flags = *flagsp &~ MSG_EOR;
656	else
657		flags = 0;
658
659	if (flags & MSG_PEEK)
660		return (EOPNOTSUPP);
661
662	/* If no space to copy out anything */
663	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
664		return (EINVAL);
665
666	sb = &so->so_rcv;
667
668	orig_resid = uio->uio_resid;
669
670	/* Prevent other readers from entering the socket. */
671	error = sblock(sb, SBLOCKWAIT(flags));
672	if (error) {
673		HVSOCK_DBG(HVSOCK_DBG_ERR,
674		    "%s: sblock returned error = %d\n", __func__, error);
675		return (error);
676	}
677
678	SOCKBUF_LOCK(sb);
679
680	cbarg.uio = uio;
681	cbarg.sb = sb;
682	/*
683	 * If the socket is closing, there might still be some data
684	 * in rx br to read. However we need to make sure
685	 * the channel is still open.
686	 */
687	if ((sb->sb_state & SBS_CANTRCVMORE) &&
688	    (so->so_state & SS_ISDISCONNECTED)) {
689		/* Other thread already closed the channel */
690		error = EPIPE;
691		goto out;
692	}
693
694	while (true) {
695		while (uio->uio_resid > 0 &&
696		    (canread = hvsock_canread_check(pcb)) > 0) {
697			to_read = MIN(canread, uio->uio_resid);
698			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
699			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
700			    (unsigned int)(sizeof(struct hvs_pkt_header) +
701			    pcb->recv_data_off));
702
703			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
704			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
705			    hvsock_br_callback, (void *)&cbarg);
706			/*
707			 * It is possible socket is disconnected becasue
708			 * we released lock in hvsock_br_callback. So we
709			 * need to check the state to make sure it is not
710			 * disconnected.
711			 */
712			if (error || so->so_state & SS_ISDISCONNECTED) {
713				break;
714			}
715
716			pcb->recv_data_len -= to_read;
717			pcb->recv_data_off += to_read;
718		}
719
720		if (error)
721			break;
722
723		/* Abort if socket has reported problems. */
724		if (so->so_error) {
725			if (so->so_error == ESHUTDOWN &&
726			    orig_resid > uio->uio_resid) {
727				/*
728				 * Although we got a FIN, we also received
729				 * some data in this round. Delivery it
730				 * to user.
731				 */
732				error = 0;
733			} else {
734				if (so->so_error != ESHUTDOWN)
735					error = so->so_error;
736			}
737
738			break;
739		}
740
741		/* Cannot received more. */
742		if (sb->sb_state & SBS_CANTRCVMORE)
743			break;
744
745		/* We are done if buffer has been filled */
746		if (uio->uio_resid == 0)
747			break;
748
749		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
750			break;
751
752		/* Buffer ring is empty and we shall not block */
753		if ((so->so_state & SS_NBIO) ||
754		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
755			if (orig_resid == uio->uio_resid) {
756				/* We have not read anything */
757				error = EAGAIN;
758			}
759			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
760			    "%s: non blocked read return, error %d.\n",
761			    __func__, error);
762			break;
763		}
764
765		/*
766		 * Wait and block until (more) data comes in.
767		 * Note: Drops the sockbuf lock during wait.
768		 */
769		error = sbwait(sb);
770
771		if (error)
772			break;
773
774		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
775		    "%s: wake up from sbwait, read available is %u\n",
776		    __func__, vmbus_chan_read_available(pcb->chan));
777	}
778
779out:
780	SOCKBUF_UNLOCK(sb);
781
782	sbunlock(sb);
783
784	/* We recieved a FIN in this call */
785	if (so->so_error == ESHUTDOWN) {
786		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
787			/* Send has already closed */
788			soisdisconnecting(so);
789		} else {
790			/* Just close the receive side */
791			socantrcvmore(so);
792		}
793	}
794
795	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
796	    "%s: returning error = %d, so_error = %d\n",
797	    __func__, error, so->so_error);
798
799	return (error);
800}
801
802int
803hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
804    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
805{
806	struct hvs_pcb *pcb = so2hvspcb(so);
807	struct sockbuf *sb;
808	ssize_t orig_resid;
809	uint32_t canwrite, to_write;
810	int error = 0;
811
812	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
813	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
814	    __func__, uio->uio_resid);
815
816	if (so->so_type != SOCK_STREAM)
817		return (EINVAL);
818	if (pcb == NULL)
819		return (EINVAL);
820
821	/* If nothing to send */
822	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
823		return (EINVAL);
824
825	sb = &so->so_snd;
826
827	orig_resid = uio->uio_resid;
828
829	/* Prevent other writers from entering the socket. */
830	error = sblock(sb, SBLOCKWAIT(flags));
831	if (error) {
832		HVSOCK_DBG(HVSOCK_DBG_ERR,
833		    "%s: sblock returned error = %d\n", __func__, error);
834		return (error);
835	}
836
837	SOCKBUF_LOCK(sb);
838
839	if ((sb->sb_state & SBS_CANTSENDMORE) ||
840	    so->so_error == ESHUTDOWN) {
841		error = EPIPE;
842		goto out;
843	}
844
845	while (uio->uio_resid > 0) {
846		canwrite = hvsock_canwrite_check(pcb);
847		if (canwrite == 0) {
848			/* We have sent some data */
849			if (orig_resid > uio->uio_resid)
850				break;
851			/*
852			 * We have not sent any data and it is
853			 * non-blocked io
854			 */
855			if (so->so_state & SS_NBIO ||
856			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
857				error = EWOULDBLOCK;
858				break;
859			} else {
860				/*
861				 * We are here because there is no space on
862				 * send buffer ring. Signal the other side
863				 * to read and free more space.
864				 * Sleep wait until space avaiable to send
865				 * Note: Drops the sockbuf lock during wait.
866				 */
867				error = sbwait(sb);
868
869				if (error)
870					break;
871
872				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
873				    "%s: wake up from sbwait, space avail on "
874				    "tx ring is %u\n",
875				    __func__,
876				    vmbus_chan_write_available(pcb->chan));
877
878				continue;
879			}
880		}
881		to_write = MIN(canwrite, uio->uio_resid);
882		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);
883
884		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
885		    "%s: canwrite is %u, to_write = %u\n", __func__,
886		    canwrite, to_write);
887		error = hvsock_send_data(pcb->chan, uio, to_write, sb);
888
889		if (error)
890			break;
891	}
892
893out:
894	SOCKBUF_UNLOCK(sb);
895	sbunlock(sb);
896
897	return (error);
898}
899
900int
901hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
902{
903	struct hvs_pcb *pcb = so2hvspcb(so);
904
905	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
906	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
907
908	if (pcb == NULL)
909		return (EINVAL);
910
911	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
912
913	return ((*nam == NULL)? ENOMEM : 0);
914}
915
916int
917hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
918{
919	struct hvs_pcb *pcb = so2hvspcb(so);
920
921	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
922	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
923
924	if (pcb == NULL)
925		return (EINVAL);
926
927	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
928
929	return ((*nam == NULL)? ENOMEM : 0);
930}
931
932void
933hvs_trans_close(struct socket *so)
934{
935	struct hvs_pcb *pcb;
936
937	if (vm_guest != VM_GUEST_HV)
938		return;
939
940	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
941	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
942
943	(void) hvs_trans_lock();
944	pcb = so2hvspcb(so);
945	if (!pcb) {
946		hvs_trans_unlock();
947		return;
948	}
949
950	if (so->so_state & SS_ISCONNECTED) {
951		/* Send a FIN to peer */
952		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
953		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
954		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
955	}
956
957	if (so->so_state &
958	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
959		soisdisconnected(so);
960
961	pcb->chan = NULL;
962	pcb->so = NULL;
963
964	if (SOLISTENING(so)) {
965		mtx_lock(&hvs_trans_socks_mtx);
966		/* Remove from bound list */
967		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
968		mtx_unlock(&hvs_trans_socks_mtx);
969	}
970
971	hvs_trans_unlock();
972
973	return;
974}
975
976void
977hvs_trans_abort(struct socket *so)
978{
979	struct hvs_pcb *pcb = so2hvspcb(so);
980
981	if (vm_guest != VM_GUEST_HV)
982		return;
983
984	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
985	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
986
987	(void) hvs_trans_lock();
988	if (pcb == NULL) {
989		hvs_trans_unlock();
990		return;
991	}
992
993	if (SOLISTENING(so)) {
994		mtx_lock(&hvs_trans_socks_mtx);
995		/* Remove from bound list */
996		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
997		mtx_unlock(&hvs_trans_socks_mtx);
998	}
999
1000	if (so->so_state & SS_ISCONNECTED) {
1001		(void) sodisconnect(so);
1002	}
1003	hvs_trans_unlock();
1004
1005	return;
1006}
1007
1008int
1009hvs_trans_shutdown(struct socket *so)
1010{
1011	struct hvs_pcb *pcb = so2hvspcb(so);
1012	struct sockbuf *sb;
1013
1014	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1015	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
1016
1017	if (pcb == NULL)
1018		return (EINVAL);
1019
1020	/*
1021	 * Only get called with the shutdown method is SHUT_WR or
1022	 * SHUT_RDWR.
1023	 * When the method is SHUT_RD or SHUT_RDWR, the caller
1024	 * already set the SBS_CANTRCVMORE on receive side socket
1025	 * buffer.
1026	 */
1027	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1028		/*
1029		 * SHUT_WR only case.
1030		 * Receive side is still open. Just close
1031		 * the send side.
1032		 */
1033		socantsendmore(so);
1034	} else {
1035		/* SHUT_RDWR case */
1036		if (so->so_state & SS_ISCONNECTED) {
1037			/* Send a FIN to peer */
1038			sb = &so->so_snd;
1039			SOCKBUF_LOCK(sb);
1040			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
1041			SOCKBUF_UNLOCK(sb);
1042
1043			soisdisconnecting(so);
1044		}
1045	}
1046
1047	return (0);
1048}
1049
1050/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1051 * <port> (see struct sockaddr_hvs).
1052 *
1053 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1054 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1055 * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1056 * the below sockaddr:
1057 *
1058 * struct SOCKADDR_HV
1059 * {
1060 *    ADDRESS_FAMILY Family;
1061 *    USHORT Reserved;
1062 *    GUID VmId;
1063 *    GUID ServiceId;
1064 * };
1065 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1066 * VMBus, because here it's obvious the host and the VM can easily identify
1067 * each other. Though the VmID is useful on the host, especially in the case
1068 * of Windows container, FreeBSD VM doesn't need it at all.
1069 *
1070 * To be compatible with similar infrastructure in Linux VMs, we have
1071 * to limit the available GUID space of SOCKADDR_HV so that we can create
1072 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1073 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1074 *
1075 ****************************************************************************
1076 * The only valid Service GUIDs, from the perspectives of both the host and *
1077 * FreeBSD VM, that can be connected by the other end, must conform to this *
1078 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1079 ****************************************************************************
1080 *
1081 * When we write apps on the host to connect(), the GUID ServiceID is used.
1082 * When we write apps in FreeBSD VM to connect(), we only need to specify the
1083 * port and the driver will form the GUID and use that to request the host.
1084 *
1085 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
1086 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
1088 * FreeBSD guest.
1089 */
1090
1091/*
1092 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
1093 * restricts HyperV socket ring buffer size to six 4K pages. Newer
1094 * HyperV hosts doen't have this limit.
1095 */
1096#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
1097#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
1098#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
1099
1100struct hvsock_sc {
1101	device_t		dev;
1102	struct hvs_pcb		*pcb;
1103	struct vmbus_channel	*channel;
1104};
1105
/*
 * Return true when the channel holds at least as many readable bytes as
 * a packet with an empty payload occupies.
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{
	const uint32_t avail = vmbus_chan_read_available(chan);

	if (avail < HVSOCK_PKT_LEN(0))
		return (false);

	return (true);
}
1113
1114static void
1115hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1116{
1117	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1118	struct socket *so;
1119	uint32_t canwrite;
1120
1121	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1122	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1123	    __func__, pcb);
1124
1125	/*
1126	 * Check if the socket is still attached and valid.
1127	 * Here we know channel is still open. Need to make
1128	 * sure the socket has not been closed or freed.
1129	 */
1130	(void) hvs_trans_lock();
1131	so = hsvpcb2so(pcb);
1132
1133	if (pcb->chan != NULL && so != NULL) {
1134		/*
1135		 * Wake up reader if there are data to read.
1136		 */
1137		SOCKBUF_LOCK(&(so)->so_rcv);
1138
1139		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1140		    "%s: read available = %u\n", __func__,
1141		    vmbus_chan_read_available(pcb->chan));
1142
1143		if (hvsock_chan_readable(pcb->chan))
1144			sorwakeup_locked(so);
1145		else
1146			SOCKBUF_UNLOCK(&(so)->so_rcv);
1147
1148		/*
1149		 * Wake up sender if space becomes available to write.
1150		 */
1151		SOCKBUF_LOCK(&(so)->so_snd);
1152		canwrite = hvsock_canwrite_check(pcb);
1153
1154		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1155		    "%s: canwrite = %u\n", __func__, canwrite);
1156
1157		if (canwrite > 0) {
1158			sowwakeup_locked(so);
1159		} else {
1160			SOCKBUF_UNLOCK(&(so)->so_snd);
1161		}
1162	}
1163
1164	hvs_trans_unlock();
1165
1166	return;
1167}
1168
1169static int
1170hvsock_br_callback(void *datap, int cplen, void *cbarg)
1171{
1172	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1173	struct uio *uio = arg->uio;
1174	struct sockbuf *sb = arg->sb;
1175	int error = 0;
1176
1177	if (cbarg == NULL || datap == NULL)
1178		return (EINVAL);
1179
1180	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1181	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1182	    "datap = %p\n",
1183	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1184	    uio->uio_resid, cplen, datap);
1185
1186	if (sb)
1187		SOCKBUF_UNLOCK(sb);
1188
1189	error = uiomove(datap, cplen, uio);
1190
1191	if (sb)
1192		SOCKBUF_LOCK(sb);
1193
1194	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1195	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1196	    __func__, uio->uio_resid, error);
1197
1198	return (error);
1199}
1200
1201static int
1202hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1203    uint32_t to_write, struct sockbuf *sb)
1204{
1205	struct hvs_pkt_header hvs_pkt;
1206	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1207	uint64_t pad = 0;
1208	struct iovec iov[3];
1209	struct hvs_callback_arg cbarg;
1210
1211	if (chan == NULL)
1212		return (ENOTCONN);
1213
1214	hlen = sizeof(struct vmbus_chanpkt_hdr);
1215	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1216	hvs_pktlen = hvs_pkthlen + to_write;
1217	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1218
1219	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1220	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1221	    "pad_pktlen = %u, data_len = %u\n",
1222	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1223
1224	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1225	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1226	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1227	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1228	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1229
1230	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1231	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1232
1233	cbarg.uio = uio;
1234	cbarg.sb = sb;
1235
1236	if (uio && to_write > 0) {
1237		iov[0].iov_base = &hvs_pkt;
1238		iov[0].iov_len = hvs_pkthlen;
1239		iov[1].iov_base = NULL;
1240		iov[1].iov_len = to_write;
1241		iov[2].iov_base = &pad;
1242		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1243
1244		error = vmbus_chan_iov_send(chan, iov, 3,
1245		    hvsock_br_callback, &cbarg);
1246	} else {
1247		if (to_write == 0) {
1248			iov[0].iov_base = &hvs_pkt;
1249			iov[0].iov_len = hvs_pkthlen;
1250			iov[1].iov_base = &pad;
1251			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1252			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1253		}
1254	}
1255
1256	if (error) {
1257		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1258		    "%s: error = %d\n", __func__, error);
1259	}
1260
1261	return (error);
1262}
1263
1264/*
1265 * Check if we have data on current ring buffer to read
1266 * or not. If not, advance the ring buffer read index to
1267 * next packet. Update the recev_data_len and recev_data_off
1268 * to new value.
1269 * Return the number of bytes can read.
1270 */
1271static uint32_t
1272hvsock_canread_check(struct hvs_pcb *pcb)
1273{
1274	uint32_t advance;
1275	uint32_t tlen, hlen, dlen;
1276	uint32_t bytes_canread = 0;
1277	int error;
1278
1279	if (pcb == NULL || pcb->chan == NULL) {
1280		pcb->so->so_error = EIO;
1281		return (0);
1282	}
1283
1284	/* Still have data not read yet on current packet */
1285	if (pcb->recv_data_len > 0)
1286		return (pcb->recv_data_len);
1287
1288	if (pcb->rb_init)
1289		advance =
1290		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1291	else
1292		advance = 0;
1293
1294	bytes_canread = vmbus_chan_read_available(pcb->chan);
1295
1296	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1297	    "%s: bytes_canread on br = %u, advance = %u\n",
1298	    __func__, bytes_canread, advance);
1299
1300	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1301		/*
1302		 * Nothing to read. Need to advance the rindex before
1303		 * calling sbwait, so host knows to wake us up when data
1304		 * is available to read on rb.
1305		 */
1306		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1307		if (error) {
1308			HVSOCK_DBG(HVSOCK_DBG_ERR,
1309			    "%s: after calling vmbus_chan_recv_idxadv, "
1310			    "got error = %d\n",  __func__, error);
1311			return (0);
1312		} else {
1313			pcb->rb_init = false;
1314			pcb->recv_data_len = 0;
1315			pcb->recv_data_off = 0;
1316			bytes_canread = vmbus_chan_read_available(pcb->chan);
1317
1318			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1319			    "%s: advanced %u bytes, "
1320			    " bytes_canread on br now = %u\n",
1321			    __func__, advance, bytes_canread);
1322
1323			if (bytes_canread == 0)
1324				return (0);
1325			else
1326				advance = 0;
1327		}
1328	}
1329
1330	if (bytes_canread <
1331	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1332		return (0);
1333
1334	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1335	    sizeof(struct hvs_pkt_header), advance);
1336
1337	/* Don't have anything to read */
1338	if (error) {
1339		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1340		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1341		    __func__, error);
1342		return (0);
1343	}
1344
1345	/*
1346	 * We just read in a new packet header. Do some sanity checks.
1347	 */
1348	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1349	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1350	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1351	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1352	    __predict_false(hlen > tlen) ||
1353	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1354		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1355		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1356		    tlen, hlen, dlen);
1357		pcb->so->so_error = EIO;
1358		return (0);
1359	}
1360	if (pcb->rb_init == false)
1361		pcb->rb_init = true;
1362
1363	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1364	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1365	    tlen, hlen, dlen);
1366
1367	/* The other side has sent a close FIN */
1368	if (dlen == 0) {
1369		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1370		    "%s: Received FIN from other side\n", __func__);
1371		/* inform the caller by seting so_error to ESHUTDOWN */
1372		pcb->so->so_error = ESHUTDOWN;
1373	}
1374
1375	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1376	    "%s: canread on receive ring is %u \n", __func__, dlen);
1377
1378	pcb->recv_data_len = dlen;
1379	pcb->recv_data_off = 0;
1380
1381	return (pcb->recv_data_len);
1382}
1383
1384static uint32_t
1385hvsock_canwrite_check(struct hvs_pcb *pcb)
1386{
1387	uint32_t writeable;
1388	uint32_t ret;
1389
1390	if (pcb == NULL || pcb->chan == NULL)
1391		return (0);
1392
1393	writeable = vmbus_chan_write_available(pcb->chan);
1394
1395	/*
1396	 * We must always reserve a 0-length-payload packet for the FIN.
1397	 */
1398	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1399	    "%s: writeable is %u, should be greater than %ju\n",
1400	    __func__, writeable,
1401	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1402
1403	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1404		/*
1405		 * The Tx ring seems full.
1406		 */
1407		return (0);
1408	}
1409
1410	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1411
1412	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1413	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1414
1415	return (rounddown2(ret, 8));
1416}
1417
1418static void
1419hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1420{
1421	vmbus_chan_set_pending_send_size(chan,
1422	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1423}
1424
1425static int
1426hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1427{
1428	unsigned int rcvbuf, sndbuf;
1429	struct hvs_pcb *pcb = so2hvspcb(so);
1430	int ret;
1431
1432	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1433		sndbuf = HVS_RINGBUF_SND_SIZE;
1434		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1435	} else {
1436		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1437		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1438		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1439		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1440		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1441		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1442	}
1443
1444	/*
1445	 * Can only read whatever user provided size of data
1446	 * from ring buffer. Turn off batched reading.
1447	 */
1448	vmbus_chan_set_readbatch(chan, false);
1449
1450	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1451	    hvsock_chan_cb, pcb);
1452
1453	if (ret != 0) {
1454		HVSOCK_DBG(HVSOCK_DBG_ERR,
1455		    "%s: failed to open hvsock channel, sndbuf = %u, "
1456		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1457	} else {
1458		HVSOCK_DBG(HVSOCK_DBG_INFO,
1459		    "%s: hvsock channel opened, sndbuf = %u, i"
1460		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1461		/*
1462		 * Se the pending send size so to receive wakeup
1463		 * signals from host when there is enough space on
1464		 * rx buffer ring to write.
1465		 */
1466		hvsock_set_chan_pending_send_size(chan);
1467	}
1468
1469	return ret;
1470}
1471
1472/*
1473 * Guest is listening passively on the socket. Open channel and
1474 * create a new socket for the conneciton.
1475 */
1476static void
1477hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1478    struct hvsock_sc *sc)
1479{
1480	struct socket *new_so;
1481	struct hvs_pcb *new_pcb, *pcb;
1482	int error;
1483
1484	/* Do nothing if socket is not listening */
1485	if (!SOLISTENING(so)) {
1486		HVSOCK_DBG(HVSOCK_DBG_ERR,
1487		    "%s: socket is not a listening one\n", __func__);
1488		return;
1489	}
1490
1491	/*
1492	 * Create a new socket. This will call pru_attach to complete
1493	 * the socket initialization and put the new socket onto
1494	 * listening socket's sol_incomp list, waiting to be promoted
1495	 * to sol_comp list.
1496	 * The new socket created has ref count 0. There is no other
1497	 * thread that changes the state of this new one at the
1498	 * moment, so we don't need to hold its lock while opening
1499	 * channel and filling out its pcb information.
1500	 */
1501	new_so = sonewconn(so, 0);
1502	if (!new_so)
1503		HVSOCK_DBG(HVSOCK_DBG_ERR,
1504		    "%s: creating new socket failed\n", __func__);
1505
1506	/*
1507	 * Now open the vmbus channel. If it fails, the socket will be
1508	 * on the listening socket's sol_incomp queue until it is
1509	 * replaced and aborted.
1510	 */
1511	error = hvsock_open_channel(chan, new_so);
1512	if (error) {
1513		new_so->so_error = error;
1514		return;
1515	}
1516
1517	pcb = so->so_pcb;
1518	new_pcb = new_so->so_pcb;
1519
1520	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1521	/* Remote port is unknown to guest in this type of conneciton */
1522	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1523	new_pcb->chan = chan;
1524	new_pcb->recv_data_len = 0;
1525	new_pcb->recv_data_off = 0;
1526	new_pcb->rb_init = false;
1527
1528	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1529	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1530
1531	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1532
1533	sc->pcb = new_pcb;
1534
1535	/*
1536	 * Change the socket state to SS_ISCONNECTED. This will promote
1537	 * the socket to sol_comp queue and wake up the thread which
1538	 * is accepting connection.
1539	 */
1540	soisconnected(new_so);
1541}
1542
1543
1544/*
1545 * Guest is actively connecting to host.
1546 */
1547static void
1548hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1549{
1550	struct hvs_pcb *pcb;
1551	int error;
1552
1553	error = hvsock_open_channel(chan, so);
1554	if (error) {
1555		so->so_error = error;
1556		return;
1557	}
1558
1559	pcb = so->so_pcb;
1560	pcb->chan = chan;
1561	pcb->recv_data_len = 0;
1562	pcb->recv_data_off = 0;
1563	pcb->rb_init = false;
1564
1565	mtx_lock(&hvs_trans_socks_mtx);
1566	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1567	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1568	mtx_unlock(&hvs_trans_socks_mtx);
1569
1570	/*
1571	 * Change the socket state to SS_ISCONNECTED. This will wake up
1572	 * the thread sleeping in connect call.
1573	 */
1574	soisconnected(so);
1575}
1576
1577static void
1578hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1579{
1580	struct hyperv_guid *inst_guid, *type_guid;
1581	bool conn_from_host;
1582	struct sockaddr_hvs addr;
1583	struct socket *so;
1584	struct hvs_pcb *pcb;
1585
1586	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1587	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1588	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1589
1590	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1591	hvsock_print_guid(type_guid);
1592	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1593	hvsock_print_guid(inst_guid);
1594	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1595	    (conn_from_host == true ) ? "from" : "to");
1596
1597	/*
1598	 * The listening port should be in [0, MAX_LISTEN_PORT]
1599	 */
1600	if (!is_valid_srv_id(type_guid))
1601		return;
1602
1603	/*
1604	 * There should be a bound socket already created no matter
1605	 * it is a passive or active connection.
1606	 * For host initiated connection (passive on guest side),
1607	 * the  type_guid contains the port which guest is bound and
1608	 * listening.
1609	 * For the guest initiated connection (active on guest side),
1610	 * the inst_guid contains the port that guest has auto bound
1611	 * to.
1612	 */
1613	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1614	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1615	if (!so) {
1616		HVSOCK_DBG(HVSOCK_DBG_ERR,
1617		    "%s: no bound socket found for port %u\n",
1618		    __func__, addr.hvs_port);
1619		return;
1620	}
1621
1622	if (conn_from_host) {
1623		hvsock_open_conn_passive(chan, so, sc);
1624	} else {
1625		(void) hvs_trans_lock();
1626		pcb = so->so_pcb;
1627		if (pcb && pcb->so) {
1628			sc->pcb = so2hvspcb(so);
1629			hvsock_open_conn_active(chan, so);
1630		} else {
1631			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1632			    "%s: channel detached before open\n", __func__);
1633		}
1634		hvs_trans_unlock();
1635	}
1636
1637}
1638
1639static int
1640hvsock_probe(device_t dev)
1641{
1642	struct vmbus_channel *channel = vmbus_get_channel(dev);
1643
1644	if (!channel || !vmbus_chan_is_hvs(channel)) {
1645		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1646		    "hvsock_probe called but not a hvsock channel id %u\n",
1647		    vmbus_chan_id(channel));
1648
1649		return ENXIO;
1650	} else {
1651		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1652		    "hvsock_probe got a hvsock channel id %u\n",
1653		    vmbus_chan_id(channel));
1654
1655		return BUS_PROBE_DEFAULT;
1656	}
1657}
1658
1659static int
1660hvsock_attach(device_t dev)
1661{
1662	struct vmbus_channel *channel = vmbus_get_channel(dev);
1663	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1664
1665	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1666
1667	hvsock_open_connection(channel, sc);
1668
1669	/*
1670	 * Always return success. On error the host will rescind the device
1671	 * in 30 seconds and we can do cleanup at that time in
1672	 * vmbus_chan_msgproc_chrescind().
1673	 */
1674	return (0);
1675}
1676
/*
 * Device detach method.
 *
 * If a pcb is recorded in the softc: mark its socket disconnected, remove
 * the pcb from the bound/connected lists, wait until both sockbuf I/O
 * locks can be taken, free the pcb and clear the socket's so_pcb. All of
 * that is done with the transport lock held. The vmbus channel is closed
 * afterwards in every case.
 *
 * Always returns 0.
 */
static int
hvsock_detach(device_t dev)
{
	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
	struct socket *so;
	int error, retry;

	if (bootverbose)
		device_printf(dev, "hvsock_detach called.\n");

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");

	if (sc->pcb != NULL) {
		(void) hvs_trans_lock();

		/* The pcb may no longer have a socket; 'so' can be NULL. */
		so = hsvpcb2so(sc->pcb);
		if (so) {
			/* Close the connection */
			if (so->so_state &
			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
				soisdisconnected(so);
		}

		mtx_lock(&hvs_trans_socks_mtx);
		__hvs_remove_pcb_from_list(sc->pcb,
		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
		mtx_unlock(&hvs_trans_socks_mtx);

		/*
		 * Close channel while no reader and sender are working
		 * on the buffer rings.
		 *
		 * Each loop below polls sblock() and retries for as long
		 * as it returns EWOULDBLOCK; 'error' is only used as the
		 * loop condition.
		 * NOTE(review): the loops have no retry bound, and 'retry'
		 * is incremented inside the HVSOCK_DBG() argument list, so
		 * whether it advances depends on how that macro evaluates
		 * its arguments.
		 */
		if (so) {
			retry = 0;
			while ((error = sblock(&so->so_rcv, 0)) ==
			    EWOULDBLOCK) {
				/*
				 * Someone is reading, rx br is busy
				 */
				soisdisconnected(so);
				DELAY(500);
				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "waiting for rx reader to exit, "
				    "retry = %d\n", retry++);
			}
			retry = 0;
			while ((error = sblock(&so->so_snd, 0)) ==
			    EWOULDBLOCK) {
				/*
				 * Someone is sending, tx br is busy
				 */
				soisdisconnected(so);
				DELAY(500);
				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "waiting for tx sender to exit, "
				    "retry = %d\n", retry++);
			}
		}


		/*
		 * Scrub and free the pcb. When a socket exists, both
		 * sockbuf I/O locks are still held at this point.
		 */
		bzero(sc->pcb, sizeof(struct hvs_pcb));
		free(sc->pcb, M_HVSOCK);
		sc->pcb = NULL;

		if (so) {
			/* Release the I/O locks and drop the stale pcb pointer. */
			sbunlock(&so->so_rcv);
			sbunlock(&so->so_snd);
			so->so_pcb = NULL;
		}

		hvs_trans_unlock();
	}

	/* Closed whether or not a pcb was attached to this device. */
	vmbus_chan_close(vmbus_get_channel(dev));

	return (0);
}
1754
/*
 * Device method table: maps the probe, attach and detach device methods
 * to the hvsock implementations in this file.
 */
static device_method_t hvsock_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, hvsock_probe),
	DEVMETHOD(device_attach, hvsock_attach),
	DEVMETHOD(device_detach, hvsock_detach),
	DEVMETHOD_END
};
1762
/*
 * Driver declaration: driver name, method table, and the size of the
 * per-device softc (struct hvsock_sc, as returned by device_get_softc()
 * in the attach and detach methods).
 */
static driver_t hvsock_driver = {
	"hv_sock",
	hvsock_methods,
	sizeof(struct hvsock_sc)
};
1768
static devclass_t hvsock_devclass;

/*
 * Register the hvsock driver under the vmbus bus, with no module event
 * handler, and declare module version 1 with a dependency on vmbus
 * version 1.
 */
DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL);
MODULE_VERSION(hvsock, 1);
MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1774