1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2020 Microsoft Corp.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/param.h>
30#include <sys/bus.h>
31#include <sys/domain.h>
32#include <sys/lock.h>
33#include <sys/kernel.h>
34#include <sys/types.h>
35#include <sys/malloc.h>
36#include <sys/module.h>
37#include <sys/mutex.h>
38#include <sys/proc.h>
39#include <sys/protosw.h>
40#include <sys/socket.h>
41#include <sys/sysctl.h>
42#include <sys/sysproto.h>
43#include <sys/systm.h>
44#include <sys/sockbuf.h>
45#include <sys/sx.h>
46#include <sys/uio.h>
47
48#include <net/vnet.h>
49
50#include <dev/hyperv/vmbus/vmbus_reg.h>
51
52#include "hv_sock.h"
53
/*
 * Debug verbosity levels.  HVSOCK_DBG() prints a message when the
 * hvs_dbg_level setting is greater than or equal to the level the message
 * was tagged with, so a higher setting also enables every lower level
 * (e.g. HVSOCK_DBG_ERR messages need a setting of at least 2).
 */
#define HVSOCK_DBG_NONE			0x0
#define HVSOCK_DBG_INFO			0x1
#define HVSOCK_DBG_ERR			0x2
#define HVSOCK_DBG_VERBOSE		0x3


/* Root node of the net.hvsock sysctl tree. */
SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");

/* Current debug level, exported as net.hvsock.hvs_dbg_level. */
static int hvs_dbg_level;
SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");


/*
 * Print a printf()-style debug message if the configured debug level is at
 * least `level'.  The message arguments are only evaluated when the message
 * is actually printed.
 */
#define HVSOCK_DBG(level, ...) do {					\
	if (hvs_dbg_level >= (level))					\
		printf(__VA_ARGS__);					\
	} while (0)
71
/* malloc(9) type used for the control structures allocated by this file. */
MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");

/* Forward declaration; referenced by hv_socket_domain below. */
static int hvs_dom_probe(void);

/* The MTU is 16KB per host side's design */
#define HVSOCK_MTU_SIZE		(1024 * 16)
/* Upper bound on the payload hvs_trans_sosend() puts into a single packet. */
#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))

/* Size of the header placed in front of every hvsock packet payload. */
#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))

/*
 * Space taken by a packet that carries payload_len bytes: the packet
 * header, the payload rounded up to a multiple of 8 bytes, and one trailing
 * uint64_t.
 * NOTE(review): the trailing uint64_t is presumably the per-packet trailer
 * kept by the VMBus ring buffer -- confirm against the vmbus code.
 */
#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
					 roundup2(payload_len, 8) + \
					 sizeof(uint64_t))
85
/*
 * HyperV Transport sockets
 *
 * Protocol switch for the single protocol offered by the AF_HYPERV domain:
 * a connection-oriented (PR_CONNREQUIRED) SOCK_STREAM transport.  Every
 * socket operation is routed to the matching hvs_trans_*() handler; send
 * and receive use the driver's own pr_sosend/pr_soreceive implementations.
 */
static struct protosw hv_socket_protosw = {
	.pr_type =		SOCK_STREAM,
	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
	.pr_flags =		PR_CONNREQUIRED,
	.pr_attach =		hvs_trans_attach,
	.pr_bind =		hvs_trans_bind,
	.pr_listen =		hvs_trans_listen,
	.pr_accept =		hvs_trans_accept,
	.pr_connect =		hvs_trans_connect,
	.pr_peeraddr =		hvs_trans_peeraddr,
	.pr_sockaddr =		hvs_trans_sockaddr,
	.pr_soreceive =		hvs_trans_soreceive,
	.pr_sosend =		hvs_trans_sosend,
	.pr_disconnect =	hvs_trans_disconnect,
	.pr_close =		hvs_trans_close,
	.pr_detach =		hvs_trans_detach,
	.pr_shutdown =		hvs_trans_shutdown,
	.pr_abort =		hvs_trans_abort,
};
108
/*
 * The AF_HYPERV protocol domain.  It contains exactly one protocol
 * (hv_socket_protosw); hvs_dom_probe() decides whether the domain may be
 * used on the running system.
 */
static struct domain		hv_socket_domain = {
	.dom_family =		AF_HYPERV,
	.dom_name =		"hyperv",
	.dom_probe =		hvs_dom_probe,
	.dom_nprotosw =		1,
	.dom_protosw =		{ &hv_socket_protosw },
};

/* Register hv_socket_domain with the kernel's domain framework. */
DOMAIN_SET(hv_socket_);
118
/* Bounds of the 32-bit AF_HYPERV port space. */
#define MAX_PORT			((uint32_t)0xFFFFFFFF)
#define MIN_PORT			((uint32_t)0x0)

/*
 * Template for the service GUIDs that can be mapped to a port:
 * 00000000-facb-11e6-bd58-64006a7986d3
 *
 * The first four bytes are zero here; set_port_by_srv_id() overwrites them
 * with the port number and is_valid_srv_id() compares the remaining twelve
 * bytes.  Note that the second and third GUID fields appear byte-swapped
 * in the array (0xcb, 0xfa for "facb" and 0xe6, 0x11 for "11e6").
 */
static const struct hyperv_guid srv_id_template = {
	.hv_guid = {
	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
};
128
/* Forward declarations of helpers defined further down in this file. */
static int		hvsock_br_callback(void *, int, void *);
static uint32_t		hvsock_canread_check(struct hvs_pcb *);
static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
static int		hvsock_send_data(struct vmbus_channel *chan,
    struct uio *uio, uint32_t to_write, struct sockbuf *sb);



/* Globals */
/* Transport-wide lock taken by hvs_trans_lock() / hvs_trans_unlock(). */
static struct sx		hvs_trans_socks_sx;
/* Held around every access to the two socket lists below. */
static struct mtx		hvs_trans_socks_mtx;
/* PCBs that own a local port, linked through their bound_next field. */
static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
/* PCBs linked through their connected_next field. */
static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
/* Last port picked by hvs_trans_connect(); the next search starts below it. */
static uint32_t			previous_auto_bound_port;
143
144static void
145hvsock_print_guid(struct hyperv_guid *guid)
146{
147	unsigned char *p = (unsigned char *)guid;
148
149	HVSOCK_DBG(HVSOCK_DBG_INFO,
150	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
151	    *(unsigned int *)p,
152	    *((unsigned short *) &p[4]),
153	    *((unsigned short *) &p[6]),
154	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
155}
156
157static bool
158is_valid_srv_id(const struct hyperv_guid *id)
159{
160	return !memcmp(&id->hv_guid[4],
161	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
162}
163
/*
 * Extract the port number stored in the first four bytes of a service GUID
 * (host byte order).
 *
 * memcpy() is used instead of dereferencing a casted pointer because
 * struct hyperv_guid is a byte array without any alignment guarantee.
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	unsigned int port;

	memcpy(&port, srv_id, sizeof(port));

	return (port);
}
169
/*
 * Store a port number into the first four bytes of a service GUID (host
 * byte order), leaving the remaining bytes untouched.
 *
 * memcpy() is used instead of writing through a casted pointer because
 * struct hyperv_guid is a byte array without any alignment guarantee.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	memcpy(srv_id, &port, sizeof(port));
}
175
176
177static void
178__hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
179{
180	struct hvs_pcb *p = NULL;
181
182	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
183
184	if (!pcb)
185		return;
186
187	if (list & HVS_LIST_BOUND) {
188		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
189			if  (p == pcb)
190				LIST_REMOVE(p, bound_next);
191	}
192
193	if (list & HVS_LIST_CONNECTED) {
194		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
195			if (p == pcb)
196				LIST_REMOVE(pcb, connected_next);
197	}
198}
199
200static void
201__hvs_remove_socket_from_list(struct socket *so, unsigned char list)
202{
203	struct hvs_pcb *pcb = so2hvspcb(so);
204
205	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
206
207	__hvs_remove_pcb_from_list(pcb, list);
208}
209
210static void
211__hvs_insert_socket_on_list(struct socket *so, unsigned char list)
212{
213	struct hvs_pcb *pcb = so2hvspcb(so);
214
215	if (list & HVS_LIST_BOUND)
216		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
217		   pcb, bound_next);
218
219	if (list & HVS_LIST_CONNECTED)
220		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
221		   pcb, connected_next);
222}
223
224void
225hvs_remove_socket_from_list(struct socket *so, unsigned char list)
226{
227	if (!so || !so->so_pcb) {
228		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
229		    "%s: socket or so_pcb is null\n", __func__);
230		return;
231	}
232
233	mtx_lock(&hvs_trans_socks_mtx);
234	__hvs_remove_socket_from_list(so, list);
235	mtx_unlock(&hvs_trans_socks_mtx);
236}
237
238static void
239hvs_insert_socket_on_list(struct socket *so, unsigned char list)
240{
241	if (!so || !so->so_pcb) {
242		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
243		    "%s: socket or so_pcb is null\n", __func__);
244		return;
245	}
246
247	mtx_lock(&hvs_trans_socks_mtx);
248	__hvs_insert_socket_on_list(so, list);
249	mtx_unlock(&hvs_trans_socks_mtx);
250}
251
252static struct socket *
253__hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
254{
255	struct hvs_pcb *p = NULL;
256
257	if (list & HVS_LIST_BOUND)
258		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
259			if (p->so != NULL &&
260			    addr->hvs_port == p->local_addr.hvs_port)
261				return p->so;
262
263	if (list & HVS_LIST_CONNECTED)
264		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
265			if (p->so != NULL &&
266			    addr->hvs_port == p->local_addr.hvs_port)
267				return p->so;
268
269	return NULL;
270}
271
272static struct socket *
273hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
274{
275	struct socket *s = NULL;
276
277	mtx_lock(&hvs_trans_socks_mtx);
278	s = __hvs_find_socket_on_list(addr, list);
279	mtx_unlock(&hvs_trans_socks_mtx);
280
281	return s;
282}
283
284static inline void
285hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
286{
287	memset(addr, 0, sizeof(*addr));
288	addr->sa_family = AF_HYPERV;
289	addr->sa_len = sizeof(*addr);
290	addr->hvs_port = port;
291}
292
/*
 * Build an AF_HYPERV socket address from a service GUID; the port is taken
 * from the GUID via get_port_by_srv_id().
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port;

	port = get_port_by_srv_id(svr_id);
	hvs_addr_set(addr, port);
}
298
299int
300hvs_trans_lock(void)
301{
302	sx_xlock(&hvs_trans_socks_sx);
303	return (0);
304}
305
306void
307hvs_trans_unlock(void)
308{
309	sx_xunlock(&hvs_trans_socks_sx);
310}
311
312static int
313hvs_dom_probe(void)
314{
315
316	/* Don't even give us a chance to attach on non-HyperV. */
317	if (vm_guest != VM_GUEST_HV)
318		return (ENXIO);
319	return (0);
320}
321
322static void
323hvs_trans_init(void *arg __unused)
324{
325
326	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
327	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
328
329	/* Initialize Globals */
330	previous_auto_bound_port = MAX_PORT;
331	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
332	mtx_init(&hvs_trans_socks_mtx,
333	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
334	LIST_INIT(&hvs_trans_bound_socks);
335	LIST_INIT(&hvs_trans_connected_socks);
336}
337SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
338    hvs_trans_init, NULL);
339
340/*
341 * Called in two cases:
342 * 1) When user calls socket();
343 * 2) When we accept new incoming conneciton and call sonewconn().
344 */
345int
346hvs_trans_attach(struct socket *so, int proto, struct thread *td)
347{
348	struct hvs_pcb *pcb = so2hvspcb(so);
349
350	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
351	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
352
353	if (so->so_type != SOCK_STREAM)
354		return (ESOCKTNOSUPPORT);
355
356	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
357		return (EPROTONOSUPPORT);
358
359	if (pcb != NULL)
360		return (EISCONN);
361	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
362	if (pcb == NULL)
363		return (ENOMEM);
364
365	pcb->so = so;
366	so->so_pcb = (void *)pcb;
367
368	return (0);
369}
370
371void
372hvs_trans_detach(struct socket *so)
373{
374	struct hvs_pcb *pcb;
375
376	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
377	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
378
379	(void) hvs_trans_lock();
380	pcb = so2hvspcb(so);
381	if (pcb == NULL) {
382		hvs_trans_unlock();
383		return;
384	}
385
386	if (SOLISTENING(so)) {
387		bzero(pcb, sizeof(*pcb));
388		free(pcb, M_HVSOCK);
389	}
390
391	so->so_pcb = NULL;
392
393	hvs_trans_unlock();
394}
395
396int
397hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
398{
399	struct hvs_pcb *pcb = so2hvspcb(so);
400	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
401	int error = 0;
402
403	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
404	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
405
406	if (sa == NULL) {
407		return (EINVAL);
408	}
409
410	if (pcb == NULL) {
411		return (EINVAL);
412	}
413
414	if (sa->sa_family != AF_HYPERV) {
415		HVSOCK_DBG(HVSOCK_DBG_ERR,
416		    "%s: Not supported, sa_family is %u\n",
417		    __func__, sa->sa_family);
418		return (EAFNOSUPPORT);
419	}
420	if (sa->sa_len != sizeof(*sa)) {
421		HVSOCK_DBG(HVSOCK_DBG_ERR,
422		    "%s: Not supported, sa_len is %u\n",
423		    __func__, sa->sa_len);
424		return (EINVAL);
425	}
426
427	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
428	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
429
430	mtx_lock(&hvs_trans_socks_mtx);
431	if (__hvs_find_socket_on_list(sa,
432	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
433		error = EADDRINUSE;
434	} else {
435		/*
436		 * The address is available for us to bind.
437		 * Add socket to the bound list.
438		 */
439		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
440		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
441		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
442	}
443	mtx_unlock(&hvs_trans_socks_mtx);
444
445	return (error);
446}
447
448int
449hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
450{
451	struct hvs_pcb *pcb = so2hvspcb(so);
452	struct socket *bound_so;
453	int error;
454
455	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
456	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
457
458	if (pcb == NULL)
459		return (EINVAL);
460
461	/* Check if the address is already bound and it was by us. */
462	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
463	if (bound_so == NULL || bound_so != so) {
464		HVSOCK_DBG(HVSOCK_DBG_ERR,
465		    "%s: Address not bound or not by us.\n", __func__);
466		return (EADDRNOTAVAIL);
467	}
468
469	SOCK_LOCK(so);
470	error = solisten_proto_check(so);
471	if (error == 0)
472		solisten_proto(so, backlog);
473	SOCK_UNLOCK(so);
474
475	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
476	    "%s: HyperV Socket listen error = %d\n", __func__, error);
477	return (error);
478}
479
480int
481hvs_trans_accept(struct socket *so, struct sockaddr *sa)
482{
483	struct hvs_pcb *pcb = so2hvspcb(so);
484
485	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
486	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
487
488	if (pcb == NULL)
489		return (EINVAL);
490
491	memcpy(sa, &pcb->remote_addr, pcb->remote_addr.sa_len);
492
493	return (0);
494}
495
496int
497hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
498{
499	struct hvs_pcb *pcb = so2hvspcb(so);
500	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
501	bool found_auto_bound_port = false;
502	int i, error = 0;
503
504	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
505	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
506	    __func__, raddr->hvs_port);
507
508	if (pcb == NULL)
509		return (EINVAL);
510
511	/* Verify the remote address */
512	if (raddr == NULL)
513		return (EINVAL);
514	if (raddr->sa_family != AF_HYPERV)
515		return (EAFNOSUPPORT);
516	if (raddr->sa_len != sizeof(*raddr))
517		return (EINVAL);
518
519	mtx_lock(&hvs_trans_socks_mtx);
520	if (so->so_state &
521	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
522			HVSOCK_DBG(HVSOCK_DBG_ERR,
523			    "%s: socket connect in progress\n",
524			    __func__);
525			error = EINPROGRESS;
526			goto out;
527	}
528
529	/*
530	 * Find an available port for us to auto bind the local
531	 * address.
532	 */
533	hvs_addr_set(&pcb->local_addr, 0);
534
535	for (i = previous_auto_bound_port - 1;
536	    i != previous_auto_bound_port; i --) {
537		if (i == MIN_PORT)
538			i = MAX_PORT;
539
540		pcb->local_addr.hvs_port = i;
541
542		if (__hvs_find_socket_on_list(&pcb->local_addr,
543		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
544			found_auto_bound_port = true;
545			previous_auto_bound_port = i;
546			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
547			    "%s: found local bound port is %x\n",
548			    __func__, pcb->local_addr.hvs_port);
549			break;
550		}
551	}
552
553	if (found_auto_bound_port == true) {
554		/* Found available port for auto bound, put on list */
555		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
556		/* Set VM service ID */
557		pcb->vm_srv_id = srv_id_template;
558		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
559		/* Set host service ID and remote port */
560		pcb->host_srv_id = srv_id_template;
561		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
562		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
563
564		/* Change the socket state to SS_ISCONNECTING */
565		soisconnecting(so);
566	} else {
567		HVSOCK_DBG(HVSOCK_DBG_ERR,
568		    "%s: No local port available for auto bound\n",
569		    __func__);
570		error = EADDRINUSE;
571	}
572
573	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
574	hvsock_print_guid(&pcb->vm_srv_id);
575	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
576	hvsock_print_guid(&pcb->host_srv_id);
577
578out:
579	mtx_unlock(&hvs_trans_socks_mtx);
580
581	if (found_auto_bound_port == true)
582		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
583
584	return (error);
585}
586
587int
588hvs_trans_disconnect(struct socket *so)
589{
590	struct hvs_pcb *pcb;
591
592	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
593	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
594
595	(void) hvs_trans_lock();
596	pcb = so2hvspcb(so);
597	if (pcb == NULL) {
598		hvs_trans_unlock();
599		return (EINVAL);
600	}
601
602	/* If socket is already disconnected, skip this */
603	if ((so->so_state & SS_ISDISCONNECTED) == 0)
604		soisdisconnecting(so);
605
606	hvs_trans_unlock();
607
608	return (0);
609}
610
/*
 * Argument bundle handed to hvsock_br_callback() through the opaque
 * callback pointer of the vmbus channel send/receive routines.
 */
struct hvs_callback_arg {
	struct uio *uio;	/* user I/O request that uiomove() works on */
	struct sockbuf *sb;	/* sockbuf whose lock is dropped around the
				 * copy; may be NULL */
};
615
/*
 * pr_soreceive handler: copy data from the channel's receive ring buffer
 * straight into the caller's uio.
 *
 * Only SOCK_STREAM sockets with a PCB are accepted (EINVAL otherwise),
 * MSG_PEEK is rejected with EOPNOTSUPP, and an empty or non-read uio yields
 * EINVAL.  The call returns as soon as some data has been read unless
 * MSG_WAITALL is set; with nothing available it either fails with EAGAIN
 * (non-blocking) or sleeps in sbwait().  paddr, mp0 and controlp are not
 * used.
 */
int
hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canread, to_read;
	int flags, error = 0;
	struct hvs_callback_arg cbarg;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & MSG_PEEK)
		return (EOPNOTSUPP);

	/* If no space to copy out anything */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
		return (EINVAL);

	/* Remembered so we can tell later whether anything was read. */
	orig_resid = uio->uio_resid;

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolock returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_rcv;
	SOCKBUF_LOCK(sb);

	cbarg.uio = uio;
	cbarg.sb = sb;
	/*
	 * If the socket is closing, there might still be some data
	 * in rx br to read. However we need to make sure
	 * the channel is still open.
	 */
	if ((sb->sb_state & SBS_CANTRCVMORE) &&
	    (so->so_state & SS_ISDISCONNECTED)) {
		/* Other thread already closed the channel */
		error = EPIPE;
		goto out;
	}

	while (true) {
		/* Drain as much of the current packet(s) as the uio can hold. */
		while (uio->uio_resid > 0 &&
		    (canread = hvsock_canread_check(pcb)) > 0) {
			to_read = MIN(canread, uio->uio_resid);
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
			    (unsigned int)(sizeof(struct hvs_pkt_header) +
			    pcb->recv_data_off));

			/*
			 * Skip the packet header plus the part of the payload
			 * consumed by earlier reads.
			 */
			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
			    hvsock_br_callback, (void *)&cbarg);
			/*
			 * It is possible the socket got disconnected because
			 * we released the lock in hvsock_br_callback. So we
			 * need to check the state to make sure it is not
			 * disconnected.
			 */
			if (error || so->so_state & SS_ISDISCONNECTED) {
				break;
			}

			/* Account for the bytes just handed to the caller. */
			pcb->recv_data_len -= to_read;
			pcb->recv_data_off += to_read;
		}

		if (error)
			break;

		/* Abort if socket has reported problems. */
		if (so->so_error) {
			if (so->so_error == ESHUTDOWN &&
			    orig_resid > uio->uio_resid) {
				/*
				 * Although we got a FIN, we also received
				 * some data in this round. Deliver it
				 * to the user.
				 */
				error = 0;
			} else {
				/* ESHUTDOWN without data is reported as EOF. */
				if (so->so_error != ESHUTDOWN)
					error = so->so_error;
			}

			break;
		}

		/* Cannot receive any more. */
		if (sb->sb_state & SBS_CANTRCVMORE)
			break;

		/* We are done if buffer has been filled */
		if (uio->uio_resid == 0)
			break;

		/* A partial read is good enough unless MSG_WAITALL was given. */
		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
			break;

		/* Buffer ring is empty and we shall not block */
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			if (orig_resid == uio->uio_resid) {
				/* We have not read anything */
				error = EAGAIN;
			}
			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
			    "%s: non blocked read return, error %d.\n",
			    __func__, error);
			break;
		}

		/*
		 * Wait and block until (more) data comes in.
		 * Note: Drops the sockbuf lock during wait.
		 */
		error = sbwait(so, SO_RCV);

		if (error)
			break;

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: wake up from sbwait, read available is %u\n",
		    __func__, vmbus_chan_read_available(pcb->chan));
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);

	/* We received a FIN in this call */
	if (so->so_error == ESHUTDOWN) {
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			/* Send has already closed */
			soisdisconnecting(so);
		} else {
			/* Just close the receive side */
			socantrcvmore(so);
		}
	}

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: returning error = %d, so_error = %d\n",
	    __func__, error, so->so_error);

	return (error);
}
780
/*
 * pr_sosend handler: copy data from the caller's uio straight into the
 * channel's send ring buffer.
 *
 * Only SOCK_STREAM sockets with a PCB are accepted (EINVAL otherwise), and
 * an empty or non-write uio yields EINVAL.  EPIPE is returned if the send
 * side has been shut down.  Data is sent in chunks of at most
 * HVSOCK_SEND_BUF_SZ bytes; when the ring buffer is full the call returns
 * what has been sent so far, fails with EWOULDBLOCK (non-blocking, nothing
 * sent) or sleeps in sbwait().  addr, top, controlp and td are not used.
 */
int
hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
{
	struct hvs_pcb *pcb = so2hvspcb(so);
	struct sockbuf *sb;
	ssize_t orig_resid;
	uint32_t canwrite, to_write;
	int error = 0;

	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
	    __func__, uio->uio_resid);

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (pcb == NULL)
		return (EINVAL);

	/* If nothing to send */
	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
		return (EINVAL);

	/* Remembered so we can tell later whether anything was sent. */
	orig_resid = uio->uio_resid;

	/* Prevent other writers from entering the socket. */
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error) {
		HVSOCK_DBG(HVSOCK_DBG_ERR,
		    "%s: soiolocak returned error = %d\n", __func__, error);
		return (error);
	}

	sb = &so->so_snd;
	SOCKBUF_LOCK(sb);

	/* Refuse to send once the send side is shut or a FIN was seen. */
	if ((sb->sb_state & SBS_CANTSENDMORE) ||
	    so->so_error == ESHUTDOWN) {
		error = EPIPE;
		goto out;
	}

	while (uio->uio_resid > 0) {
		canwrite = hvsock_canwrite_check(pcb);
		if (canwrite == 0) {
			/* We have sent some data */
			if (orig_resid > uio->uio_resid)
				break;
			/*
			 * We have not sent any data and it is
			 * non-blocked io
			 */
			if (so->so_state & SS_NBIO ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				error = EWOULDBLOCK;
				break;
			} else {
				/*
				 * We are here because there is no space on
				 * send buffer ring. Signal the other side
				 * to read and free more space.
				 * Sleep wait until space is available to send.
				 * Note: Drops the sockbuf lock during wait.
				 */
				error = sbwait(so, SO_SND);

				if (error)
					break;

				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
				    "%s: wake up from sbwait, space avail on "
				    "tx ring is %u\n",
				    __func__,
				    vmbus_chan_write_available(pcb->chan));

				continue;
			}
		}
		/* Bounded by ring space, remaining data and the chunk limit. */
		to_write = MIN(canwrite, uio->uio_resid);
		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);

		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
		    "%s: canwrite is %u, to_write = %u\n", __func__,
		    canwrite, to_write);
		error = hvsock_send_data(pcb->chan, uio, to_write, sb);

		if (error)
			break;
	}

out:
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_SEND_UNLOCK(so);

	return (error);
}
877
878int
879hvs_trans_peeraddr(struct socket *so, struct sockaddr *sa)
880{
881	struct hvs_pcb *pcb = so2hvspcb(so);
882
883	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
884	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
885
886	if (pcb == NULL)
887		return (EINVAL);
888
889	memcpy(sa, &pcb->remote_addr, pcb->remote_addr.sa_len);
890
891	return (0);
892}
893
894int
895hvs_trans_sockaddr(struct socket *so, struct sockaddr *sa)
896{
897	struct hvs_pcb *pcb = so2hvspcb(so);
898
899	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
900	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
901
902	if (pcb == NULL)
903		return (EINVAL);
904
905	memcpy(sa, &pcb->local_addr, pcb->local_addr.sa_len);
906
907	return (0);
908}
909
910void
911hvs_trans_close(struct socket *so)
912{
913	struct hvs_pcb *pcb;
914
915	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
916	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
917
918	(void) hvs_trans_lock();
919	pcb = so2hvspcb(so);
920	if (!pcb) {
921		hvs_trans_unlock();
922		return;
923	}
924
925	if (so->so_state & SS_ISCONNECTED) {
926		/* Send a FIN to peer */
927		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
928		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
929		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
930	}
931
932	if (so->so_state &
933	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
934		soisdisconnected(so);
935
936	pcb->chan = NULL;
937	pcb->so = NULL;
938
939	if (SOLISTENING(so)) {
940		mtx_lock(&hvs_trans_socks_mtx);
941		/* Remove from bound list */
942		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
943		mtx_unlock(&hvs_trans_socks_mtx);
944	}
945
946	hvs_trans_unlock();
947
948	return;
949}
950
951void
952hvs_trans_abort(struct socket *so)
953{
954	struct hvs_pcb *pcb = so2hvspcb(so);
955
956	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
957	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
958
959	(void) hvs_trans_lock();
960	if (pcb == NULL) {
961		hvs_trans_unlock();
962		return;
963	}
964
965	if (SOLISTENING(so)) {
966		mtx_lock(&hvs_trans_socks_mtx);
967		/* Remove from bound list */
968		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
969		mtx_unlock(&hvs_trans_socks_mtx);
970	}
971
972	if (so->so_state & SS_ISCONNECTED) {
973		(void) sodisconnect(so);
974	}
975	hvs_trans_unlock();
976
977	return;
978}
979
980int
981hvs_trans_shutdown(struct socket *so, enum shutdown_how how)
982{
983	struct hvs_pcb *pcb = so2hvspcb(so);
984
985	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
986	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
987
988	SOCK_LOCK(so);
989	if ((so->so_state &
990	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
991		SOCK_UNLOCK(so);
992		return (ENOTCONN);
993	}
994	SOCK_UNLOCK(so);
995
996	if (pcb == NULL)
997		return (EINVAL);
998
999	switch (how) {
1000	case SHUT_RD:
1001		socantrcvmore(so);
1002		break;
1003	case SHUT_RDWR:
1004		socantrcvmore(so);
1005		if (so->so_state & SS_ISCONNECTED) {
1006			/* Send a FIN to peer */
1007			SOCK_SENDBUF_LOCK(so);
1008			(void) hvsock_send_data(pcb->chan, NULL, 0,
1009			    &so->so_snd);
1010			SOCK_SENDBUF_UNLOCK(so);
1011			soisdisconnecting(so);
1012		}
1013		/* FALLTHROUGH */
1014	case SHUT_WR:
1015		socantsendmore(so);
1016	}
1017	wakeup(&so->so_timeo);
1018
1019	return (0);
1020}
1021
1022/* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1023 * <port> (see struct sockaddr_hvs).
1024 *
1025 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1026 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1027 * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1028 * the below sockaddr:
1029 *
1030 * struct SOCKADDR_HV
1031 * {
1032 *    ADDRESS_FAMILY Family;
1033 *    USHORT Reserved;
1034 *    GUID VmId;
1035 *    GUID ServiceId;
1036 * };
1037 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1038 * VMBus, because here it's obvious the host and the VM can easily identify
1039 * each other. Though the VmID is useful on the host, especially in the case
1040 * of Windows container, FreeBSD VM doesn't need it at all.
1041 *
1042 * To be compatible with similar infrastructure in Linux VMs, we have
1043 * to limit the available GUID space of SOCKADDR_HV so that we can create
1044 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1045 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1046 *
1047 ****************************************************************************
1048 * The only valid Service GUIDs, from the perspectives of both the host and *
1049 * FreeBSD VM, that can be connected by the other end, must conform to this *
1050 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1051 ****************************************************************************
1052 *
1053 * When we write apps on the host to connect(), the GUID ServiceID is used.
1054 * When we write apps in FreeBSD VM to connect(), we only need to specify the
1055 * port and the driver will form the GUID and use that to request the host.
1056 *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
1061 */
1062
/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict the HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
 *
 * The RCV/SND values are the six-page sizes; HVS_RINGBUF_MAX_SIZE is the
 * 64-page upper bound.
 */
#define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
#define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
1071
/*
 * Per-device state tying together a device, a socket PCB and a vmbus
 * channel.
 * NOTE(review): presumably the softc of the hvsock device driver -- confirm
 * against the driver's device methods.
 */
struct hvsock_sc {
	device_t		dev;		/* the device itself */
	struct hvs_pcb		*pcb;		/* associated socket PCB */
	struct vmbus_channel	*channel;	/* associated vmbus channel */
};
1077
/*
 * Tell whether the channel's receive ring buffer holds at least one
 * complete packet, i.e. at least as many bytes as a packet with an empty
 * payload occupies.
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{
	const uint32_t avail = vmbus_chan_read_available(chan);

	if (avail < HVSOCK_PKT_LEN(0))
		return (false);

	return (true);
}
1085
1086static void
1087hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1088{
1089	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1090	struct socket *so;
1091	uint32_t canwrite;
1092
1093	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1094	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1095	    __func__, pcb);
1096
1097	/*
1098	 * Check if the socket is still attached and valid.
1099	 * Here we know channel is still open. Need to make
1100	 * sure the socket has not been closed or freed.
1101	 */
1102	(void) hvs_trans_lock();
1103	so = hsvpcb2so(pcb);
1104
1105	if (pcb->chan != NULL && so != NULL) {
1106		/*
1107		 * Wake up reader if there are data to read.
1108		 */
1109		SOCKBUF_LOCK(&(so)->so_rcv);
1110
1111		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1112		    "%s: read available = %u\n", __func__,
1113		    vmbus_chan_read_available(pcb->chan));
1114
1115		if (hvsock_chan_readable(pcb->chan))
1116			sorwakeup_locked(so);
1117		else
1118			SOCKBUF_UNLOCK(&(so)->so_rcv);
1119
1120		/*
1121		 * Wake up sender if space becomes available to write.
1122		 */
1123		SOCKBUF_LOCK(&(so)->so_snd);
1124		canwrite = hvsock_canwrite_check(pcb);
1125
1126		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1127		    "%s: canwrite = %u\n", __func__, canwrite);
1128
1129		if (canwrite > 0) {
1130			sowwakeup_locked(so);
1131		} else {
1132			SOCKBUF_UNLOCK(&(so)->so_snd);
1133		}
1134	}
1135
1136	hvs_trans_unlock();
1137
1138	return;
1139}
1140
1141static int
1142hvsock_br_callback(void *datap, int cplen, void *cbarg)
1143{
1144	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1145	struct uio *uio = arg->uio;
1146	struct sockbuf *sb = arg->sb;
1147	int error = 0;
1148
1149	if (cbarg == NULL || datap == NULL)
1150		return (EINVAL);
1151
1152	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1153	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1154	    "datap = %p\n",
1155	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1156	    uio->uio_resid, cplen, datap);
1157
1158	if (sb)
1159		SOCKBUF_UNLOCK(sb);
1160
1161	error = uiomove(datap, cplen, uio);
1162
1163	if (sb)
1164		SOCKBUF_LOCK(sb);
1165
1166	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1167	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1168	    __func__, uio->uio_resid, error);
1169
1170	return (error);
1171}
1172
1173static int
1174hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1175    uint32_t to_write, struct sockbuf *sb)
1176{
1177	struct hvs_pkt_header hvs_pkt;
1178	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1179	uint64_t pad = 0;
1180	struct iovec iov[3];
1181	struct hvs_callback_arg cbarg;
1182
1183	if (chan == NULL)
1184		return (ENOTCONN);
1185
1186	hlen = sizeof(struct vmbus_chanpkt_hdr);
1187	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1188	hvs_pktlen = hvs_pkthlen + to_write;
1189	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1190
1191	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1192	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1193	    "pad_pktlen = %u, data_len = %u\n",
1194	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1195
1196	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1197	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1198	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1199	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1200	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1201
1202	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1203	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1204
1205	cbarg.uio = uio;
1206	cbarg.sb = sb;
1207
1208	if (uio && to_write > 0) {
1209		iov[0].iov_base = &hvs_pkt;
1210		iov[0].iov_len = hvs_pkthlen;
1211		iov[1].iov_base = NULL;
1212		iov[1].iov_len = to_write;
1213		iov[2].iov_base = &pad;
1214		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1215
1216		error = vmbus_chan_iov_send(chan, iov, 3,
1217		    hvsock_br_callback, &cbarg);
1218	} else {
1219		if (to_write == 0) {
1220			iov[0].iov_base = &hvs_pkt;
1221			iov[0].iov_len = hvs_pkthlen;
1222			iov[1].iov_base = &pad;
1223			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1224			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1225		}
1226	}
1227
1228	if (error) {
1229		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1230		    "%s: error = %d\n", __func__, error);
1231	}
1232
1233	return (error);
1234}
1235
1236/*
1237 * Check if we have data on current ring buffer to read
1238 * or not. If not, advance the ring buffer read index to
1239 * next packet. Update the recev_data_len and recev_data_off
1240 * to new value.
1241 * Return the number of bytes can read.
1242 */
1243static uint32_t
1244hvsock_canread_check(struct hvs_pcb *pcb)
1245{
1246	uint32_t advance;
1247	uint32_t tlen, hlen, dlen;
1248	uint32_t bytes_canread = 0;
1249	int error;
1250
1251	if (pcb == NULL || pcb->chan == NULL) {
1252		pcb->so->so_error = EIO;
1253		return (0);
1254	}
1255
1256	/* Still have data not read yet on current packet */
1257	if (pcb->recv_data_len > 0)
1258		return (pcb->recv_data_len);
1259
1260	if (pcb->rb_init)
1261		advance =
1262		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1263	else
1264		advance = 0;
1265
1266	bytes_canread = vmbus_chan_read_available(pcb->chan);
1267
1268	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1269	    "%s: bytes_canread on br = %u, advance = %u\n",
1270	    __func__, bytes_canread, advance);
1271
1272	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1273		/*
1274		 * Nothing to read. Need to advance the rindex before
1275		 * calling sbwait, so host knows to wake us up when data
1276		 * is available to read on rb.
1277		 */
1278		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1279		if (error) {
1280			HVSOCK_DBG(HVSOCK_DBG_ERR,
1281			    "%s: after calling vmbus_chan_recv_idxadv, "
1282			    "got error = %d\n",  __func__, error);
1283			return (0);
1284		} else {
1285			pcb->rb_init = false;
1286			pcb->recv_data_len = 0;
1287			pcb->recv_data_off = 0;
1288			bytes_canread = vmbus_chan_read_available(pcb->chan);
1289
1290			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1291			    "%s: advanced %u bytes, "
1292			    " bytes_canread on br now = %u\n",
1293			    __func__, advance, bytes_canread);
1294
1295			if (bytes_canread == 0)
1296				return (0);
1297			else
1298				advance = 0;
1299		}
1300	}
1301
1302	if (bytes_canread <
1303	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1304		return (0);
1305
1306	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1307	    sizeof(struct hvs_pkt_header), advance);
1308
1309	/* Don't have anything to read */
1310	if (error) {
1311		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1312		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1313		    __func__, error);
1314		return (0);
1315	}
1316
1317	/*
1318	 * We just read in a new packet header. Do some sanity checks.
1319	 */
1320	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1321	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1322	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1323	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1324	    __predict_false(hlen > tlen) ||
1325	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1326		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1327		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1328		    tlen, hlen, dlen);
1329		pcb->so->so_error = EIO;
1330		return (0);
1331	}
1332	if (pcb->rb_init == false)
1333		pcb->rb_init = true;
1334
1335	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1336	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1337	    tlen, hlen, dlen);
1338
1339	/* The other side has sent a close FIN */
1340	if (dlen == 0) {
1341		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1342		    "%s: Received FIN from other side\n", __func__);
1343		/* inform the caller by seting so_error to ESHUTDOWN */
1344		pcb->so->so_error = ESHUTDOWN;
1345	}
1346
1347	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1348	    "%s: canread on receive ring is %u \n", __func__, dlen);
1349
1350	pcb->recv_data_len = dlen;
1351	pcb->recv_data_off = 0;
1352
1353	return (pcb->recv_data_len);
1354}
1355
1356static uint32_t
1357hvsock_canwrite_check(struct hvs_pcb *pcb)
1358{
1359	uint32_t writeable;
1360	uint32_t ret;
1361
1362	if (pcb == NULL || pcb->chan == NULL)
1363		return (0);
1364
1365	writeable = vmbus_chan_write_available(pcb->chan);
1366
1367	/*
1368	 * We must always reserve a 0-length-payload packet for the FIN.
1369	 */
1370	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1371	    "%s: writeable is %u, should be greater than %ju\n",
1372	    __func__, writeable,
1373	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1374
1375	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1376		/*
1377		 * The Tx ring seems full.
1378		 */
1379		return (0);
1380	}
1381
1382	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1383
1384	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1385	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1386
1387	return (rounddown2(ret, 8));
1388}
1389
1390static void
1391hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1392{
1393	vmbus_chan_set_pending_send_size(chan,
1394	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1395}
1396
1397static int
1398hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1399{
1400	unsigned int rcvbuf, sndbuf;
1401	struct hvs_pcb *pcb = so2hvspcb(so);
1402	int ret;
1403
1404	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1405		sndbuf = HVS_RINGBUF_SND_SIZE;
1406		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1407	} else {
1408		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1409		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1410		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1411		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1412		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1413		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1414	}
1415
1416	/*
1417	 * Can only read whatever user provided size of data
1418	 * from ring buffer. Turn off batched reading.
1419	 */
1420	vmbus_chan_set_readbatch(chan, false);
1421
1422	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1423	    hvsock_chan_cb, pcb);
1424
1425	if (ret != 0) {
1426		HVSOCK_DBG(HVSOCK_DBG_ERR,
1427		    "%s: failed to open hvsock channel, sndbuf = %u, "
1428		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1429	} else {
1430		HVSOCK_DBG(HVSOCK_DBG_INFO,
1431		    "%s: hvsock channel opened, sndbuf = %u, i"
1432		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1433		/*
1434		 * Se the pending send size so to receive wakeup
1435		 * signals from host when there is enough space on
1436		 * rx buffer ring to write.
1437		 */
1438		hvsock_set_chan_pending_send_size(chan);
1439	}
1440
1441	return ret;
1442}
1443
1444/*
1445 * Guest is listening passively on the socket. Open channel and
1446 * create a new socket for the conneciton.
1447 */
1448static void
1449hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1450    struct hvsock_sc *sc)
1451{
1452	struct socket *new_so;
1453	struct hvs_pcb *new_pcb, *pcb;
1454	int error;
1455
1456	/* Do nothing if socket is not listening */
1457	if (!SOLISTENING(so)) {
1458		HVSOCK_DBG(HVSOCK_DBG_ERR,
1459		    "%s: socket is not a listening one\n", __func__);
1460		return;
1461	}
1462
1463	/*
1464	 * Create a new socket. This will call pru_attach to complete
1465	 * the socket initialization and put the new socket onto
1466	 * listening socket's sol_incomp list, waiting to be promoted
1467	 * to sol_comp list.
1468	 * The new socket created has ref count 0. There is no other
1469	 * thread that changes the state of this new one at the
1470	 * moment, so we don't need to hold its lock while opening
1471	 * channel and filling out its pcb information.
1472	 */
1473	new_so = sonewconn(so, 0);
1474	if (!new_so)
1475		HVSOCK_DBG(HVSOCK_DBG_ERR,
1476		    "%s: creating new socket failed\n", __func__);
1477
1478	/*
1479	 * Now open the vmbus channel. If it fails, the socket will be
1480	 * on the listening socket's sol_incomp queue until it is
1481	 * replaced and aborted.
1482	 */
1483	error = hvsock_open_channel(chan, new_so);
1484	if (error) {
1485		new_so->so_error = error;
1486		return;
1487	}
1488
1489	pcb = so->so_pcb;
1490	new_pcb = new_so->so_pcb;
1491
1492	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1493	/* Remote port is unknown to guest in this type of conneciton */
1494	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1495	new_pcb->chan = chan;
1496	new_pcb->recv_data_len = 0;
1497	new_pcb->recv_data_off = 0;
1498	new_pcb->rb_init = false;
1499
1500	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1501	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1502
1503	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1504
1505	sc->pcb = new_pcb;
1506
1507	/*
1508	 * Change the socket state to SS_ISCONNECTED. This will promote
1509	 * the socket to sol_comp queue and wake up the thread which
1510	 * is accepting connection.
1511	 */
1512	soisconnected(new_so);
1513}
1514
1515
1516/*
1517 * Guest is actively connecting to host.
1518 */
1519static void
1520hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1521{
1522	struct hvs_pcb *pcb;
1523	int error;
1524
1525	error = hvsock_open_channel(chan, so);
1526	if (error) {
1527		so->so_error = error;
1528		return;
1529	}
1530
1531	pcb = so->so_pcb;
1532	pcb->chan = chan;
1533	pcb->recv_data_len = 0;
1534	pcb->recv_data_off = 0;
1535	pcb->rb_init = false;
1536
1537	mtx_lock(&hvs_trans_socks_mtx);
1538	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1539	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1540	mtx_unlock(&hvs_trans_socks_mtx);
1541
1542	/*
1543	 * Change the socket state to SS_ISCONNECTED. This will wake up
1544	 * the thread sleeping in connect call.
1545	 */
1546	soisconnected(so);
1547}
1548
1549static void
1550hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1551{
1552	struct hyperv_guid *inst_guid, *type_guid;
1553	bool conn_from_host;
1554	struct sockaddr_hvs addr;
1555	struct socket *so;
1556	struct hvs_pcb *pcb;
1557
1558	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1559	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1560	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1561
1562	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1563	hvsock_print_guid(type_guid);
1564	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1565	hvsock_print_guid(inst_guid);
1566	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1567	    (conn_from_host == true ) ? "from" : "to");
1568
1569	/*
1570	 * The listening port should be in [0, MAX_LISTEN_PORT]
1571	 */
1572	if (!is_valid_srv_id(type_guid))
1573		return;
1574
1575	/*
1576	 * There should be a bound socket already created no matter
1577	 * it is a passive or active connection.
1578	 * For host initiated connection (passive on guest side),
1579	 * the  type_guid contains the port which guest is bound and
1580	 * listening.
1581	 * For the guest initiated connection (active on guest side),
1582	 * the inst_guid contains the port that guest has auto bound
1583	 * to.
1584	 */
1585	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1586	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1587	if (!so) {
1588		HVSOCK_DBG(HVSOCK_DBG_ERR,
1589		    "%s: no bound socket found for port %u\n",
1590		    __func__, addr.hvs_port);
1591		return;
1592	}
1593
1594	if (conn_from_host) {
1595		hvsock_open_conn_passive(chan, so, sc);
1596	} else {
1597		(void) hvs_trans_lock();
1598		pcb = so->so_pcb;
1599		if (pcb && pcb->so) {
1600			sc->pcb = so2hvspcb(so);
1601			hvsock_open_conn_active(chan, so);
1602		} else {
1603			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1604			    "%s: channel detached before open\n", __func__);
1605		}
1606		hvs_trans_unlock();
1607	}
1608
1609}
1610
1611static int
1612hvsock_probe(device_t dev)
1613{
1614	struct vmbus_channel *channel = vmbus_get_channel(dev);
1615
1616	if (!channel || !vmbus_chan_is_hvs(channel)) {
1617		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1618		    "hvsock_probe called but not a hvsock channel id %u\n",
1619		    vmbus_chan_id(channel));
1620
1621		return ENXIO;
1622	} else {
1623		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1624		    "hvsock_probe got a hvsock channel id %u\n",
1625		    vmbus_chan_id(channel));
1626
1627		return BUS_PROBE_DEFAULT;
1628	}
1629}
1630
1631static int
1632hvsock_attach(device_t dev)
1633{
1634	struct vmbus_channel *channel = vmbus_get_channel(dev);
1635	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1636
1637	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1638
1639	hvsock_open_connection(channel, sc);
1640
1641	/*
1642	 * Always return success. On error the host will rescind the device
1643	 * in 30 seconds and we can do cleanup at that time in
1644	 * vmbus_chan_msgproc_chrescind().
1645	 */
1646	return (0);
1647}
1648
1649static int
1650hvsock_detach(device_t dev)
1651{
1652	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1653	struct socket *so;
1654	int retry;
1655
1656	if (bootverbose)
1657		device_printf(dev, "hvsock_detach called.\n");
1658
1659	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
1660
1661	if (sc->pcb != NULL) {
1662		(void) hvs_trans_lock();
1663
1664		so = hsvpcb2so(sc->pcb);
1665		if (so) {
1666			/* Close the connection */
1667			if (so->so_state &
1668			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
1669				soisdisconnected(so);
1670		}
1671
1672		mtx_lock(&hvs_trans_socks_mtx);
1673		__hvs_remove_pcb_from_list(sc->pcb,
1674		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
1675		mtx_unlock(&hvs_trans_socks_mtx);
1676
1677		/*
1678		 * Close channel while no reader and sender are working
1679		 * on the buffer rings.
1680		 */
1681		if (so) {
1682			retry = 0;
1683			while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
1684				/*
1685				 * Someone is reading, rx br is busy
1686				 */
1687				soisdisconnected(so);
1688				DELAY(500);
1689				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1690				    "waiting for rx reader to exit, "
1691				    "retry = %d\n", retry++);
1692			}
1693			retry = 0;
1694			while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
1695				/*
1696				 * Someone is sending, tx br is busy
1697				 */
1698				soisdisconnected(so);
1699				DELAY(500);
1700				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1701				    "waiting for tx sender to exit, "
1702				    "retry = %d\n", retry++);
1703			}
1704		}
1705
1706
1707		bzero(sc->pcb, sizeof(struct hvs_pcb));
1708		free(sc->pcb, M_HVSOCK);
1709		sc->pcb = NULL;
1710
1711		if (so) {
1712			SOCK_IO_RECV_UNLOCK(so);
1713			SOCK_IO_SEND_UNLOCK(so);
1714			so->so_pcb = NULL;
1715		}
1716
1717		hvs_trans_unlock();
1718	}
1719
1720	vmbus_chan_close(vmbus_get_channel(dev));
1721
1722	return (0);
1723}
1724
/*
 * Device method table: binds the generic device_probe/attach/detach
 * methods to this driver's implementations.
 */
static device_method_t hvsock_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, hvsock_probe),
	DEVMETHOD(device_attach, hvsock_attach),
	DEVMETHOD(device_detach, hvsock_detach),
	DEVMETHOD_END
};

/*
 * Driver description: driver name, method table and the size of the
 * per-device softc (struct hvsock_sc).
 */
static driver_t hvsock_driver = {
	"hv_sock",
	hvsock_methods,
	sizeof(struct hvsock_sc)
};

/*
 * Register the driver as module "hvsock" on the vmbus bus, give the
 * module version 1 and declare its dependency on the vmbus module
 * (version 1).
 */
DRIVER_MODULE(hvsock, vmbus, hvsock_driver, NULL, NULL);
MODULE_VERSION(hvsock, 1);
MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1742