hv_storvsc_drv_freebsd.c revision 315812
/*-
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * Copyright (c) 2012 Citrix Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * StorVSC driver for Hyper-V.  This driver presents a SCSI HBA interface
 * to the Common Access Method (CAM) layer.  CAM control blocks (CCBs) are
 * converted into VSCSI protocol messages which are delivered to the parent
 * partition StorVSP driver over the Hyper-V VMBUS.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c 315812 2017-03-23 06:40:20Z mav $");

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/condvar.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/bus.h>
#include <sys/mutex.h>
#include <sys/callout.h>
#include <sys/smp.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/uma.h>
#include <sys/sema.h>
#include <sys/sglist.h>
#include <sys/eventhandler.h>
#include <machine/bus.h>
#include <sys/bus_dma.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_periph.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_internal.h>
#include <cam/cam_debug.h>
#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_message.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/vmbus.h>
#include "hv_vstorage.h"
#include "vmbus_if.h"

#define STORVSC_MAX_LUNS_PER_TARGET	(64)
#define STORVSC_MAX_IO_REQUESTS		(STORVSC_MAX_LUNS_PER_TARGET * 2)
#define BLKVSC_MAX_IDE_DISKS_PER_TARGET	(1)
#define BLKVSC_MAX_IO_REQUESTS		STORVSC_MAX_IO_REQUESTS
#define STORVSC_MAX_TARGETS		(2)

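/*
 * VSTOR_PKT_SIZE is the number of vstor packet bytes actually placed on
 * the ring buffer.  Hosts older than WIN8 do not understand the win8
 * extension fields, so the run-time vmscsi_size_delta (established during
 * protocol negotiation below) is subtracted from the structure size.
 */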
#define VSTOR_PKT_SIZE	(sizeof(struct vstor_packet) - vmscsi_size_delta)

/*
 * 33 segments are needed to allow 128KB maxio, in case the data
 * in the first page is _not_ PAGE_SIZE aligned, e.g.
 *
 *     |<----------- 128KB ----------->|
 *     |                               |
 *  0  2K 4K    8K   16K   124K  128K  130K
 *  |  |  |     |     |       |     |  |
 *  +--+--+-----+-----+.......+-----+--+--+
 *  |  |  |     |     |       |     |  |  | DATA
 *  |  |  |     |     |       |     |  |  |
 *  +--+--+-----+-----+.......------+--+--+
 *     |  |                         |  |
 *     | 1|            31           | 1| ...... # of segments
 */
#define STORVSC_DATA_SEGCNT_MAX		33
#define STORVSC_DATA_SEGSZ_MAX		PAGE_SIZE
#define STORVSC_DATA_SIZE_MAX		\
	((STORVSC_DATA_SEGCNT_MAX - 1) * STORVSC_DATA_SEGSZ_MAX)

struct storvsc_softc;

struct hv_sgl_node {
	LIST_ENTRY(hv_sgl_node) link;
	struct sglist *sgl_data;
};

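/*
 * Global pool of pre-allocated scatter/gather lists, each backed by
 * page-sized buffers.  They serve as bounce buffers for I/O whose data
 * segments are not suitably page-aligned; the pool is populated once,
 * at the first adapter attach.
 */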
struct hv_sgl_page_pool {
	LIST_HEAD(, hv_sgl_node) in_use_sgl_list;
	LIST_HEAD(, hv_sgl_node) free_sgl_list;
	boolean_t                is_init;
} g_hv_sgl_page_pool;

enum storvsc_request_type {
	WRITE_TYPE,
	READ_TYPE,
	UNKNOWN_TYPE
};

SYSCTL_NODE(_hw, OID_AUTO, storvsc, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
	"Hyper-V storage interface");

static u_int hv_storvsc_use_win8ext_flags = 1;
SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_win8ext_flags, CTLFLAG_RW,
	&hv_storvsc_use_win8ext_flags, 0,
	"Use win8 extension flags or not");

static u_int hv_storvsc_use_pim_unmapped = 1;
SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_pim_unmapped, CTLFLAG_RDTUN,
	&hv_storvsc_use_pim_unmapped, 0,
	"Optimize storvsc by using unmapped I/O");

static u_int hv_storvsc_ringbuffer_size = (64 * PAGE_SIZE);
SYSCTL_UINT(_hw_storvsc, OID_AUTO, ringbuffer_size, CTLFLAG_RDTUN,
	&hv_storvsc_ringbuffer_size, 0, "Hyper-V storage ringbuffer size");

static u_int hv_storvsc_max_io = 512;
SYSCTL_UINT(_hw_storvsc, OID_AUTO, max_io, CTLFLAG_RDTUN,
	&hv_storvsc_max_io, 0, "Hyper-V storage max io limit");

static int hv_storvsc_chan_cnt = 0;
SYSCTL_INT(_hw_storvsc, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
	&hv_storvsc_chan_cnt, 0, "# of channels to use");
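/*
 * The CTLFLAG_RDTUN knobs above are read-only at run time and are set as
 * loader tunables instead, e.g. hw.storvsc.chan_cnt="4" in loader.conf
 * (the value here is purely illustrative).
 */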

#define STORVSC_MAX_IO						\
	vmbus_chan_prplist_nelem(hv_storvsc_ringbuffer_size,	\
	   STORVSC_DATA_SEGCNT_MAX, VSTOR_PKT_SIZE)
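/*
 * STORVSC_MAX_IO is the number of requests, each carrying a maximum-sized
 * PRP list, that fit into the configured ring buffer; it bounds the
 * per-target I/O count negotiated at attach time.
 */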

struct hv_storvsc_sysctl {
	u_long		data_bio_cnt;
	u_long		data_vaddr_cnt;
	u_long		data_sg_cnt;
	u_long		chan_send_cnt[MAXCPU];
};

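/*
 * A VMBUS GPA (guest physical address) range header followed by up to
 * STORVSC_DATA_SEGCNT_MAX page frame numbers; this is the PRP list that
 * describes an I/O buffer to the host.
 */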
struct storvsc_gpa_range {
	struct vmbus_gpa_range	gpa_range;
	uint64_t		gpa_page[STORVSC_DATA_SEGCNT_MAX];
} __packed;

struct hv_storvsc_request {
	LIST_ENTRY(hv_storvsc_request)	link;
	struct vstor_packet		vstor_packet;
	int				prp_cnt;
	struct storvsc_gpa_range	prp_list;
	void				*sense_data;
	uint8_t				sense_info_len;
	uint8_t				retries;
	union ccb			*ccb;
	struct storvsc_softc		*softc;
	struct callout			callout;
	struct sema			synch_sema; /* synchronize request/response if needed */
	struct sglist			*bounce_sgl;
	unsigned int			bounce_sgl_count;
	uint64_t			not_aligned_seg_bits;
	bus_dmamap_t			data_dmap;
};

struct storvsc_softc {
	struct vmbus_channel		*hs_chan;
	LIST_HEAD(, hv_storvsc_request)	hs_free_list;
	struct mtx			hs_lock;
	struct storvsc_driver_props	*hs_drv_props;
	int 				hs_unit;
	uint32_t			hs_frozen;
	struct cam_sim			*hs_sim;
	struct cam_path 		*hs_path;
	uint32_t			hs_num_out_reqs;
	boolean_t			hs_destroy;
	boolean_t			hs_drain_notify;
	struct sema 			hs_drain_sema;
	struct hv_storvsc_request	hs_init_req;
	struct hv_storvsc_request	hs_reset_req;
	device_t			hs_dev;
	bus_dma_tag_t			storvsc_req_dtag;
	struct hv_storvsc_sysctl	sysctl_data;
	uint32_t			hs_nchan;
	struct vmbus_channel		*hs_sel_chan[MAXCPU];
};

static eventhandler_tag storvsc_handler_tag;
/*
 * The size of the vmscsi_request has changed in win8. The
 * additional size is for the newly added elements in the
 * structure. These elements are valid only when we are talking
 * to a win8 host.
 * Track the correct size we need to apply.
 */
static int vmscsi_size_delta = sizeof(struct vmscsi_win8_extension);

/**
 * HyperV storvsc timeout testing cases:
 * a. IO returned after first timeout;
 * b. IO returned after second timeout and queue freeze;
 * c. IO returned while timer handler is running
 * The first can be tested by "sg_senddiag -vv /dev/daX",
 * and the second and third can be done by
 * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX".
 */
#define HVS_TIMEOUT_TEST 0

/*
 * Bus/adapter reset functionality on the Hyper-V host is
 * buggy and is disabled until it can be further tested.
 */
#define HVS_HOST_RESET 0

struct storvsc_driver_props {
	char		*drv_name;
	char		*drv_desc;
	uint8_t		drv_max_luns_per_target;
	uint32_t	drv_max_ios_per_target;
	uint32_t	drv_ringbuffer_size;
};

enum hv_storage_type {
	DRIVER_BLKVSC,
	DRIVER_STORVSC,
	DRIVER_UNKNOWN
};

#define HS_MAX_ADAPTERS 10

#define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1

/* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */
static const struct hyperv_guid gStorVscDeviceType = {
	.hv_guid = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d,
		 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f}
};

/* {32412632-86cb-44a2-9b5c-50d1417354f5} */
static const struct hyperv_guid gBlkVscDeviceType = {
	.hv_guid = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44,
		 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5}
};

static struct storvsc_driver_props g_drv_props_table[] = {
	{"blkvsc", "Hyper-V IDE",
	 BLKVSC_MAX_IDE_DISKS_PER_TARGET, BLKVSC_MAX_IO_REQUESTS,
	 20*PAGE_SIZE},
	{"storvsc", "Hyper-V SCSI",
	 STORVSC_MAX_LUNS_PER_TARGET, STORVSC_MAX_IO_REQUESTS,
	 20*PAGE_SIZE}
};

/*
 * Sense buffer size changed in win8; have a run-time
 * variable to track the size we should use.
 */
static int sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE;

/*
 * The storage protocol version is determined during the
 * initial exchange with the host.  It will indicate which
 * storage functionality is available in the host.
 */
static int vmstor_proto_version;

struct vmstor_proto {
	int proto_version;
	int sense_buffer_size;
	int vmscsi_size_delta;
};

static const struct vmstor_proto vmstor_proto_list[] = {
	{
		VMSTOR_PROTOCOL_VERSION_WIN10,
		POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
		0
	},
	{
		VMSTOR_PROTOCOL_VERSION_WIN8_1,
		POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
		0
	},
	{
		VMSTOR_PROTOCOL_VERSION_WIN8,
		POST_WIN7_STORVSC_SENSE_BUFFER_SIZE,
		0
	},
	{
		VMSTOR_PROTOCOL_VERSION_WIN7,
		PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE,
		sizeof(struct vmscsi_win8_extension),
	},
	{
		VMSTOR_PROTOCOL_VERSION_WIN6,
		PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE,
		sizeof(struct vmscsi_win8_extension),
	}
};

/* static functions */
static int storvsc_probe(device_t dev);
static int storvsc_attach(device_t dev);
static int storvsc_detach(device_t dev);
static void storvsc_poll(struct cam_sim *sim);
static void storvsc_action(struct cam_sim *sim, union ccb *ccb);
static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp);
static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp);
static enum hv_storage_type storvsc_get_storage_type(device_t dev);
static void hv_storvsc_rescan_target(struct storvsc_softc *sc);
static void hv_storvsc_on_channel_callback(struct vmbus_channel *chan, void *xsc);
static void hv_storvsc_on_iocompletion(struct storvsc_softc *sc,
					struct vstor_packet *vstor_packet,
					struct hv_storvsc_request *request);
static int hv_storvsc_connect_vsp(struct storvsc_softc *);
static void storvsc_io_done(struct hv_storvsc_request *reqp);
static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl,
				bus_dma_segment_t *orig_sgl,
				unsigned int orig_sgl_count,
				uint64_t seg_bits);
void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl,
				unsigned int dest_sgl_count,
				struct sglist *src_sgl,
				uint64_t seg_bits);

static device_method_t storvsc_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		storvsc_probe),
	DEVMETHOD(device_attach,	storvsc_attach),
	DEVMETHOD(device_detach,	storvsc_detach),
	DEVMETHOD(device_shutdown,      bus_generic_shutdown),
	DEVMETHOD_END
};

static driver_t storvsc_driver = {
	"storvsc", storvsc_methods, sizeof(struct storvsc_softc),
};

static devclass_t storvsc_devclass;
DRIVER_MODULE(storvsc, vmbus, storvsc_driver, storvsc_devclass, 0, 0);
MODULE_VERSION(storvsc, 1);
MODULE_DEPEND(storvsc, vmbus, 1, 1, 1);

static void
storvsc_subchan_attach(struct storvsc_softc *sc,
    struct vmbus_channel *new_channel)
{
	struct vmstor_chan_props props;
	int ret = 0;

	memset(&props, 0, sizeof(props));

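	/*
	 * Round-robin the sub-channel's target CPU, then open it with the
	 * same ring size and callback as the primary channel.
	 */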
	vmbus_chan_cpu_rr(new_channel);
	ret = vmbus_chan_open(new_channel,
	    sc->hs_drv_props->drv_ringbuffer_size,
	    sc->hs_drv_props->drv_ringbuffer_size,
	    (void *)&props,
	    sizeof(struct vmstor_chan_props),
	    hv_storvsc_on_channel_callback, sc);
}

/**
 * @brief Send multi-channel creation request to host
 *
 * @param sc  storvsc softc
 * @param max_subch  maximum number of sub-channels supported by the host
 */
static void
storvsc_send_multichannel_request(struct storvsc_softc *sc, int max_subch)
{
	struct vmbus_channel **subchan;
	struct hv_storvsc_request *request;
	struct vstor_packet *vstor_packet;
	int request_subch;
	int ret, i;

	/* get the number of sub-channels we need to create */
	request_subch = MIN(max_subch, mp_ncpus - 1);

	request = &sc->hs_init_req;

	/* ask the host to create the sub-channels */
	memset(request, 0, sizeof(struct hv_storvsc_request));

	sema_init(&request->synch_sema, 0, ("stor_synch_sema"));

	vstor_packet = &request->vstor_packet;

	vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS;
	vstor_packet->flags = REQUEST_COMPLETION_FLAG;
	vstor_packet->u.multi_channels_cnt = request_subch;

	ret = vmbus_chan_send(sc->hs_chan,
	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
	    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);

	sema_wait(&request->synch_sema);

	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
	    vstor_packet->status != 0) {
		printf("Storvsc_error: create multi-channel invalid operation "
		    "(%d) or status (%u)\n",
		    vstor_packet->operation, vstor_packet->status);
		return;
	}

	/* Update channel count */
	sc->hs_nchan = request_subch + 1;

	/* Wait for sub-channels setup to complete. */
	subchan = vmbus_subchan_get(sc->hs_chan, request_subch);

	/* Attach the sub-channels. */
	for (i = 0; i < request_subch; ++i)
		storvsc_subchan_attach(sc, subchan[i]);

	/* Release the sub-channels. */
	vmbus_subchan_rel(subchan, request_subch);

	if (bootverbose)
		printf("Storvsc create multi-channel success!\n");
}

/**
 * @brief initialize channel connection to parent partition
 *
 * @param sc  storvsc softc
 * @returns  0 on success, non-zero error on failure
 */
static int
hv_storvsc_channel_init(struct storvsc_softc *sc)
{
	int ret = 0, i;
	struct hv_storvsc_request *request;
	struct vstor_packet *vstor_packet;
	uint16_t max_subch;
	boolean_t support_multichannel;
	uint32_t version;

	max_subch = 0;
	support_multichannel = FALSE;

	request = &sc->hs_init_req;
	memset(request, 0, sizeof(struct hv_storvsc_request));
	vstor_packet = &request->vstor_packet;
	request->softc = sc;

	/*
	 * Initiate the vsc/vsp initialization protocol on the open channel
	 */
	sema_init(&request->synch_sema, 0, ("stor_synch_sema"));

	vstor_packet->operation = VSTOR_OPERATION_BEGININITIALIZATION;
	vstor_packet->flags = REQUEST_COMPLETION_FLAG;

	ret = vmbus_chan_send(sc->hs_chan,
	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
	    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);

	if (ret != 0)
		goto cleanup;

	sema_wait(&request->synch_sema);

	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
	    vstor_packet->status != 0) {
		goto cleanup;
	}

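	/*
	 * vmstor_proto_list is ordered from the newest protocol version to
	 * the oldest, so the first version the host accepts below is the
	 * highest one both ends support.
	 */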
	for (i = 0; i < nitems(vmstor_proto_list); i++) {
		/* reuse the request packet for each version we probe */

		memset(vstor_packet, 0, sizeof(struct vstor_packet));
		vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION;
		vstor_packet->flags = REQUEST_COMPLETION_FLAG;

		vstor_packet->u.version.major_minor =
			vmstor_proto_list[i].proto_version;

		/* revision is only significant for Windows guests */
		vstor_packet->u.version.revision = 0;

		ret = vmbus_chan_send(sc->hs_chan,
		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
		    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);

		if (ret != 0)
			goto cleanup;

		sema_wait(&request->synch_sema);

		if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO) {
			ret = EINVAL;
			goto cleanup;
		}
		if (vstor_packet->status == 0) {
			vmstor_proto_version =
				vmstor_proto_list[i].proto_version;
			sense_buffer_size =
				vmstor_proto_list[i].sense_buffer_size;
			vmscsi_size_delta =
				vmstor_proto_list[i].vmscsi_size_delta;
			break;
		}
	}

	if (vstor_packet->status != 0) {
		ret = EINVAL;
		goto cleanup;
	}
	/*
	 * Query channel properties
	 */
	memset(vstor_packet, 0, sizeof(struct vstor_packet));
	vstor_packet->operation = VSTOR_OPERATION_QUERYPROPERTIES;
	vstor_packet->flags = REQUEST_COMPLETION_FLAG;

	ret = vmbus_chan_send(sc->hs_chan,
	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
	    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);

	if (ret != 0)
		goto cleanup;

	sema_wait(&request->synch_sema);

	/* TODO: Check returned version */
	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
	    vstor_packet->status != 0) {
		goto cleanup;
	}

	max_subch = vstor_packet->u.chan_props.max_channel_cnt;
	if (hv_storvsc_chan_cnt > 0 && hv_storvsc_chan_cnt < (max_subch + 1))
		max_subch = hv_storvsc_chan_cnt - 1;

	/* the multi-channel feature is supported on WIN8 and later hosts */
	version = VMBUS_GET_VERSION(device_get_parent(sc->hs_dev), sc->hs_dev);
	if (version != VMBUS_VERSION_WIN7 && version != VMBUS_VERSION_WS2008 &&
	    (vstor_packet->u.chan_props.flags &
	     HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) {
		support_multichannel = TRUE;
	}
	if (bootverbose) {
		device_printf(sc->hs_dev, "max chans %d%s\n", max_subch + 1,
		    support_multichannel ? ", multi-chan capable" : "");
	}

	memset(vstor_packet, 0, sizeof(struct vstor_packet));
	vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION;
	vstor_packet->flags = REQUEST_COMPLETION_FLAG;

	ret = vmbus_chan_send(sc->hs_chan,
	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
	    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);

	if (ret != 0) {
		goto cleanup;
	}

	sema_wait(&request->synch_sema);

	if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO ||
	    vstor_packet->status != 0)
		goto cleanup;

	/*
	 * If multi-channel is supported, send a multi-channel create
	 * request to the host.
	 */
	if (support_multichannel && max_subch > 0)
		storvsc_send_multichannel_request(sc, max_subch);
cleanup:
	sema_destroy(&request->synch_sema);
	return (ret);
}

/**
 * @brief Open channel connection to parent partition StorVSP driver
 *
 * Open and initialize channel connection to parent partition StorVSP driver.
 *
 * @param sc  pointer to a storvsc softc
 * @returns 0 on success, non-zero error on failure
 */
static int
hv_storvsc_connect_vsp(struct storvsc_softc *sc)
{
	int ret = 0;
	struct vmstor_chan_props props;

	memset(&props, 0, sizeof(struct vmstor_chan_props));

	/*
	 * Open the channel
	 */
	vmbus_chan_cpu_rr(sc->hs_chan);
	ret = vmbus_chan_open(
		sc->hs_chan,
		sc->hs_drv_props->drv_ringbuffer_size,
		sc->hs_drv_props->drv_ringbuffer_size,
		(void *)&props,
		sizeof(struct vmstor_chan_props),
		hv_storvsc_on_channel_callback, sc);

	if (ret != 0) {
		return (ret);
	}

	ret = hv_storvsc_channel_init(sc);
	return (ret);
}

#if HVS_HOST_RESET
static int
hv_storvsc_host_reset(struct storvsc_softc *sc)
{
	int ret = 0;

	struct hv_storvsc_request *request;
	struct vstor_packet *vstor_packet;

	request = &sc->hs_reset_req;
	request->softc = sc;
	vstor_packet = &request->vstor_packet;

	sema_init(&request->synch_sema, 0, "stor synch sema");

	vstor_packet->operation = VSTOR_OPERATION_RESETBUS;
	vstor_packet->flags = REQUEST_COMPLETION_FLAG;

	ret = vmbus_chan_send(sc->hs_chan,
	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
	    vstor_packet, VSTOR_PKT_SIZE,
	    (uint64_t)(uintptr_t)&sc->hs_reset_req);

	if (ret != 0) {
		goto cleanup;
	}

	sema_wait(&request->synch_sema);

	/*
	 * At this point, all outstanding requests in the adapter
	 * should have been flushed out and returned to us.
	 */

cleanup:
	sema_destroy(&request->synch_sema);
	return (ret);
}
#endif /* HVS_HOST_RESET */

/**
 * @brief Function to initiate an I/O request
 *
 * @param sc  storvsc softc
 * @param request  pointer to a request structure
 * @returns 0 on success, non-zero error on failure
 */
static int
hv_storvsc_io_request(struct storvsc_softc *sc,
					  struct hv_storvsc_request *request)
{
	struct vstor_packet *vstor_packet = &request->vstor_packet;
	struct vmbus_channel *outgoing_channel = NULL;
	int ret = 0, ch_sel;

	vstor_packet->flags |= REQUEST_COMPLETION_FLAG;

	vstor_packet->u.vm_srb.length =
	    sizeof(struct vmscsi_req) - vmscsi_size_delta;

	vstor_packet->u.vm_srb.sense_info_len = sense_buffer_size;

	vstor_packet->u.vm_srb.transfer_len =
	    request->prp_list.gpa_range.gpa_len;

	vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB;

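	/*
	 * Spread requests across the opened channels: mixing the LUN with
	 * the issuing CPU keeps a single busy LUN from pinning all of its
	 * traffic onto one channel.
	 */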
	ch_sel = (vstor_packet->u.vm_srb.lun + curcpu) % sc->hs_nchan;
	outgoing_channel = sc->hs_sel_chan[ch_sel];

	mtx_unlock(&request->softc->hs_lock);
	if (request->prp_list.gpa_range.gpa_len) {
		ret = vmbus_chan_send_prplist(outgoing_channel,
		    &request->prp_list.gpa_range, request->prp_cnt,
		    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
	} else {
		ret = vmbus_chan_send(outgoing_channel,
		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
		    vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request);
	}
	/* count successful request sends on each channel */
	if (!ret) {
		sc->sysctl_data.chan_send_cnt[ch_sel]++;
	}
	mtx_lock(&request->softc->hs_lock);

	if (ret != 0) {
		printf("Unable to send packet %p ret %d\n", vstor_packet, ret);
	} else {
		atomic_add_int(&sc->hs_num_out_reqs, 1);
	}

	return (ret);
}

/**
 * Process IO_COMPLETION_OPERATION and hand the result to the
 * CAM layer for upper-level processing.
 */
static void
hv_storvsc_on_iocompletion(struct storvsc_softc *sc,
			   struct vstor_packet *vstor_packet,
			   struct hv_storvsc_request *request)
{
	struct vmscsi_req *vm_srb;

	vm_srb = &vstor_packet->u.vm_srb;

	/*
	 * Copy some fields of the host's response into the request structure,
	 * because the fields will be used later in storvsc_io_done().
	 */
	request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status;
	request->vstor_packet.u.vm_srb.srb_status = vm_srb->srb_status;
	request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len;

	if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) &&
			(vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)) {
		/* Autosense data available */

		KASSERT(vm_srb->sense_info_len <= request->sense_info_len,
				("vm_srb->sense_info_len <= "
				 "request->sense_info_len"));

		memcpy(request->sense_data, vm_srb->u.sense_data,
			vm_srb->sense_info_len);

		request->sense_info_len = vm_srb->sense_info_len;
	}

	/* Complete request by passing to the CAM layer */
	storvsc_io_done(request);
	atomic_subtract_int(&sc->hs_num_out_reqs, 1);
	if (sc->hs_drain_notify && (sc->hs_num_out_reqs == 0)) {
		sema_post(&sc->hs_drain_sema);
	}
}

static void
hv_storvsc_rescan_target(struct storvsc_softc *sc)
{
	path_id_t pathid;
	target_id_t targetid;
	union ccb *ccb;

	pathid = cam_sim_path(sc->hs_sim);
	targetid = CAM_TARGET_WILDCARD;

	/*
	 * Allocate a CCB and schedule a rescan.
	 */
	ccb = xpt_alloc_ccb_nowait();
	if (ccb == NULL) {
		printf("unable to alloc CCB for rescan\n");
		return;
	}

	if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid,
	    CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
		printf("unable to create path for rescan, pathid: %u, "
		    "targetid: %u\n", pathid, targetid);
		xpt_free_ccb(ccb);
		return;
	}

	if (targetid == CAM_TARGET_WILDCARD)
		ccb->ccb_h.func_code = XPT_SCAN_BUS;
	else
		ccb->ccb_h.func_code = XPT_SCAN_TGT;

	xpt_rescan(ccb);
}

static void
hv_storvsc_on_channel_callback(struct vmbus_channel *channel, void *xsc)
{
	int ret = 0;
	struct storvsc_softc *sc = xsc;
	uint32_t bytes_recvd;
	uint64_t request_id;
	uint8_t packet[roundup2(sizeof(struct vstor_packet), 8)];
	struct hv_storvsc_request *request;
	struct vstor_packet *vstor_packet;

	bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8);
	ret = vmbus_chan_recv(channel, packet, &bytes_recvd, &request_id);
	KASSERT(ret != ENOBUFS, ("storvsc recvbuf is not large enough"));
	/* XXX check bytes_recvd to make sure that it contains enough data */

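	/*
	 * The transaction id echoed back by the host is the pointer to the
	 * request originally handed to vmbus_chan_send*(), so it can be
	 * cast back directly.  The init and reset requests are completed
	 * via their semaphores rather than the normal I/O completion path.
	 */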
	while ((ret == 0) && (bytes_recvd > 0)) {
		request = (struct hv_storvsc_request *)(uintptr_t)request_id;

		if ((request == &sc->hs_init_req) ||
			(request == &sc->hs_reset_req)) {
			memcpy(&request->vstor_packet, packet,
				   sizeof(struct vstor_packet));
			sema_post(&request->synch_sema);
		} else {
			vstor_packet = (struct vstor_packet *)packet;
			switch (vstor_packet->operation) {
			case VSTOR_OPERATION_COMPLETEIO:
				if (request == NULL)
					panic("VMBUS: storvsc received a "
					    "packet with NULL request id in "
					    "COMPLETEIO operation.");

				hv_storvsc_on_iocompletion(sc,
							vstor_packet, request);
				break;
			case VSTOR_OPERATION_REMOVEDEVICE:
				printf("VMBUS: storvsc operation %d not "
				    "implemented.\n", vstor_packet->operation);
				/* TODO: implement */
				break;
			case VSTOR_OPERATION_ENUMERATE_BUS:
				hv_storvsc_rescan_target(sc);
				break;
			default:
				break;
			}
		}

		bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8);
		ret = vmbus_chan_recv(channel, packet, &bytes_recvd,
		    &request_id);
		KASSERT(ret != ENOBUFS,
		    ("storvsc recvbuf is not large enough"));
		/*
		 * XXX check bytes_recvd to make sure that it contains
		 * enough data
		 */
	}
}

/**
 * @brief StorVSC probe function
 *
 * Device probe function.  Returns BUS_PROBE_DEFAULT if the input device
 * is a StorVSC device; otherwise ENXIO is returned.  If the input device
 * is a BlkVSC (paravirtual IDE) device and that support is disabled in
 * favor of the emulated ATA/IDE device, ENXIO is returned as well.
 *
 * @param dev a device
 * @returns BUS_PROBE_DEFAULT on success, ENXIO if not a matching StorVSC device
 */
static int
storvsc_probe(device_t dev)
{
	int ret	= ENXIO;

	switch (storvsc_get_storage_type(dev)) {
	case DRIVER_BLKVSC:
		if (bootverbose)
			device_printf(dev,
			    "Enlightened ATA/IDE detected\n");
		device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc);
		ret = BUS_PROBE_DEFAULT;
		break;
	case DRIVER_STORVSC:
		if (bootverbose)
			device_printf(dev, "Enlightened SCSI device detected\n");
		device_set_desc(dev, g_drv_props_table[DRIVER_STORVSC].drv_desc);
		ret = BUS_PROBE_DEFAULT;
		break;
	default:
		ret = ENXIO;
	}
	return (ret);
}

static void
storvsc_create_chan_sel(struct storvsc_softc *sc)
{
	struct vmbus_channel **subch;
	int i, nsubch;

	sc->hs_sel_chan[0] = sc->hs_chan;
	nsubch = sc->hs_nchan - 1;
	if (nsubch == 0)
		return;

	subch = vmbus_subchan_get(sc->hs_chan, nsubch);
	for (i = 0; i < nsubch; i++)
		sc->hs_sel_chan[i + 1] = subch[i];
	vmbus_subchan_rel(subch, nsubch);
}

static int
storvsc_init_requests(device_t dev)
{
	struct storvsc_softc *sc = device_get_softc(dev);
	struct hv_storvsc_request *reqp;
	int error, i;

	LIST_INIT(&sc->hs_free_list);

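	/*
	 * The PAGE_SIZE boundary and maxsegsize below guarantee that no
	 * bus_dma segment ever crosses a page boundary, which is exactly
	 * what the PRP (page list) format sent to the host requires.
	 */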
	error = bus_dma_tag_create(
		bus_get_dma_tag(dev),		/* parent */
		1,				/* alignment */
		PAGE_SIZE,			/* boundary */
		BUS_SPACE_MAXADDR,		/* lowaddr */
		BUS_SPACE_MAXADDR,		/* highaddr */
		NULL, NULL,			/* filter, filterarg */
		STORVSC_DATA_SIZE_MAX,		/* maxsize */
		STORVSC_DATA_SEGCNT_MAX,	/* nsegments */
		STORVSC_DATA_SEGSZ_MAX,		/* maxsegsize */
		0,				/* flags */
		NULL,				/* lockfunc */
		NULL,				/* lockfuncarg */
		&sc->storvsc_req_dtag);
	if (error) {
		device_printf(dev, "failed to create storvsc dma tag\n");
		return (error);
	}

	for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; ++i) {
		reqp = malloc(sizeof(struct hv_storvsc_request),
				 M_DEVBUF, M_WAITOK|M_ZERO);
		reqp->softc = sc;
		error = bus_dmamap_create(sc->storvsc_req_dtag, 0,
				&reqp->data_dmap);
		if (error) {
			device_printf(dev, "failed to allocate storvsc "
			    "data dmamap\n");
			goto cleanup;
		}
		LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
	}
	return (0);

cleanup:
	while ((reqp = LIST_FIRST(&sc->hs_free_list)) != NULL) {
		LIST_REMOVE(reqp, link);
		bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap);
		free(reqp, M_DEVBUF);
	}
	return (error);
}

static void
storvsc_sysctl(device_t dev)
{
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid *ch_tree, *chid_tree;
	struct storvsc_softc *sc;
	char name[16];
	int i;

	sc = device_get_softc(dev);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));

	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_bio_cnt", CTLFLAG_RW,
		&sc->sysctl_data.data_bio_cnt, "# of bio data blocks");
	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_vaddr_cnt", CTLFLAG_RW,
		&sc->sysctl_data.data_vaddr_cnt, "# of vaddr data blocks");
	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_sg_cnt", CTLFLAG_RW,
		&sc->sysctl_data.data_sg_cnt, "# of sg data blocks");

	/* dev.storvsc.UNIT.channel */
	ch_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "channel",
		CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
	if (ch_tree == NULL)
		return;

	for (i = 0; i < sc->hs_nchan; i++) {
		uint32_t ch_id;

		ch_id = vmbus_chan_id(sc->hs_sel_chan[i]);
		snprintf(name, sizeof(name), "%d", ch_id);
		/* dev.storvsc.UNIT.channel.CHID */
		chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree),
			OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
		if (chid_tree == NULL)
			return;
		/* dev.storvsc.UNIT.channel.CHID.send_req */
		SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO,
			"send_req", CTLFLAG_RD, &sc->sysctl_data.chan_send_cnt[i],
			"# of requests sent on this channel");
	}
}

/**
 * @brief StorVSC attach function
 *
 * Function responsible for allocating per-device structures,
 * setting up CAM interfaces and scanning for available LUNs to
 * be used for SCSI device peripherals.
 *
 * @param dev a device
 * @returns 0 on success or an error on failure
 */
static int
storvsc_attach(device_t dev)
{
	enum hv_storage_type stor_type;
	struct storvsc_softc *sc;
	struct cam_devq *devq;
	int ret, i, j;
	struct hv_storvsc_request *reqp;
	struct root_hold_token *root_mount_token = NULL;
	struct hv_sgl_node *sgl_node = NULL;
	void *tmp_buff = NULL;

	/*
	 * We need to serialize storvsc attach calls.
	 */
	root_mount_token = root_mount_hold("storvsc");

	sc = device_get_softc(dev);
	sc->hs_nchan = 1;
	sc->hs_chan = vmbus_get_channel(dev);

	stor_type = storvsc_get_storage_type(dev);

	if (stor_type == DRIVER_UNKNOWN) {
		ret = ENODEV;
		goto cleanup;
	}

	/* fill in driver specific properties */
	sc->hs_drv_props = &g_drv_props_table[stor_type];
	sc->hs_drv_props->drv_ringbuffer_size = hv_storvsc_ringbuffer_size;
	sc->hs_drv_props->drv_max_ios_per_target =
		MIN(STORVSC_MAX_IO, hv_storvsc_max_io);
	if (bootverbose) {
		printf("storvsc ringbuffer size: %d, max_io: %d\n",
			sc->hs_drv_props->drv_ringbuffer_size,
			sc->hs_drv_props->drv_max_ios_per_target);
	}
	/* fill in device specific properties */
	sc->hs_unit	= device_get_unit(dev);
	sc->hs_dev	= dev;

	mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF);

	ret = storvsc_init_requests(dev);
	if (ret != 0)
		goto cleanup;

	/* create the sg-list page pool */
	if (FALSE == g_hv_sgl_page_pool.is_init) {
		g_hv_sgl_page_pool.is_init = TRUE;
		LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list);
		LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list);

		/*
		 * Pre-create SG lists; each SG list has
		 * STORVSC_DATA_SEGCNT_MAX segments, and each
		 * segment is backed by a one-page buffer.
		 */
		for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; i++) {
			sgl_node = malloc(sizeof(struct hv_sgl_node),
			    M_DEVBUF, M_WAITOK|M_ZERO);

			sgl_node->sgl_data =
			    sglist_alloc(STORVSC_DATA_SEGCNT_MAX,
			    M_WAITOK|M_ZERO);

			for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) {
				tmp_buff = malloc(PAGE_SIZE,
				    M_DEVBUF, M_WAITOK|M_ZERO);

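				/*
				 * Note: the page's kernel virtual address is
				 * stashed in the ss_paddr field here; the
				 * bounce-buffer copy helpers below cast it
				 * back to a pointer when copying data.
				 */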
				sgl_node->sgl_data->sg_segs[j].ss_paddr =
				    (vm_paddr_t)tmp_buff;
			}

			LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list,
			    sgl_node, link);
		}
	}

	sc->hs_destroy = FALSE;
	sc->hs_drain_notify = FALSE;
	sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema");

	ret = hv_storvsc_connect_vsp(sc);
	if (ret != 0) {
		goto cleanup;
	}

	/* Construct cpu to channel mapping */
	storvsc_create_chan_sel(sc);

	/*
	 * Create the device queue.
	 * Hyper-V maps each target to one SCSI HBA
	 */
	devq = cam_simq_alloc(sc->hs_drv_props->drv_max_ios_per_target);
	if (devq == NULL) {
		device_printf(dev, "Failed to alloc device queue\n");
		ret = ENOMEM;
		goto cleanup;
	}

	sc->hs_sim = cam_sim_alloc(storvsc_action,
				storvsc_poll,
				sc->hs_drv_props->drv_name,
				sc,
				sc->hs_unit,
				&sc->hs_lock, 1,
				sc->hs_drv_props->drv_max_ios_per_target,
				devq);

	if (sc->hs_sim == NULL) {
		device_printf(dev, "Failed to alloc sim\n");
		cam_simq_free(devq);
		ret = ENOMEM;
		goto cleanup;
	}

	mtx_lock(&sc->hs_lock);
	/* bus_id is set to 0, need to get it from VMBUS channel query? */
	if (xpt_bus_register(sc->hs_sim, dev, 0) != CAM_SUCCESS) {
		cam_sim_free(sc->hs_sim, /*free_devq*/TRUE);
		mtx_unlock(&sc->hs_lock);
		device_printf(dev, "Unable to register SCSI bus\n");
		ret = ENXIO;
		goto cleanup;
	}

	if (xpt_create_path(&sc->hs_path, /*periph*/NULL,
		 cam_sim_path(sc->hs_sim),
		CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
		xpt_bus_deregister(cam_sim_path(sc->hs_sim));
		cam_sim_free(sc->hs_sim, /*free_devq*/TRUE);
		mtx_unlock(&sc->hs_lock);
		device_printf(dev, "Unable to create path\n");
		ret = ENXIO;
		goto cleanup;
	}

	mtx_unlock(&sc->hs_lock);

	storvsc_sysctl(dev);

	root_mount_rel(root_mount_token);
	return (0);

cleanup:
	root_mount_rel(root_mount_token);
	while (!LIST_EMPTY(&sc->hs_free_list)) {
		reqp = LIST_FIRST(&sc->hs_free_list);
		LIST_REMOVE(reqp, link);
		bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap);
		free(reqp, M_DEVBUF);
	}

	while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
		sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
		LIST_REMOVE(sgl_node, link);
		for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) {
			if (NULL !=
			    (void *)sgl_node->sgl_data->sg_segs[j].ss_paddr) {
				free((void *)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF);
			}
		}
		sglist_free(sgl_node->sgl_data);
		free(sgl_node, M_DEVBUF);
	}

	return (ret);
}

/**
 * @brief StorVSC device detach function
 *
 * This function is responsible for safely detaching a
 * StorVSC device.  This includes waiting for inbound responses
 * to complete and freeing associated per-device structures.
 *
 * @param dev a device
 * @returns 0 on success
 */
static int
storvsc_detach(device_t dev)
{
	struct storvsc_softc *sc = device_get_softc(dev);
	struct hv_storvsc_request *reqp = NULL;
	struct hv_sgl_node *sgl_node = NULL;
	int j = 0;

	sc->hs_destroy = TRUE;

	/*
	 * At this point, all outbound traffic should be disabled. We
	 * only allow inbound traffic (responses) to proceed so that
	 * outstanding requests can be completed.
	 */

	sc->hs_drain_notify = TRUE;
	sema_wait(&sc->hs_drain_sema);
	sc->hs_drain_notify = FALSE;

	/*
	 * Since we have already drained, we don't need to busy wait.
	 * The call to close the channel will reset the callback
	 * under the protection of the incoming channel lock.
	 */

	vmbus_chan_close(sc->hs_chan);

	mtx_lock(&sc->hs_lock);
	while (!LIST_EMPTY(&sc->hs_free_list)) {
		reqp = LIST_FIRST(&sc->hs_free_list);
		LIST_REMOVE(reqp, link);
		bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap);
		free(reqp, M_DEVBUF);
	}
	mtx_unlock(&sc->hs_lock);

	while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
		sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
		LIST_REMOVE(sgl_node, link);
		for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) {
			if (NULL !=
			    (void *)sgl_node->sgl_data->sg_segs[j].ss_paddr) {
				free((void *)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF);
			}
		}
		sglist_free(sgl_node->sgl_data);
		free(sgl_node, M_DEVBUF);
	}

	return (0);
}

#if HVS_TIMEOUT_TEST
/**
 * @brief unit test for timed out operations
 *
 * This function provides unit testing capability to simulate
 * timed out operations.  Recompilation with HVS_TIMEOUT_TEST=1
 * is required.
 *
 * @param reqp pointer to a request structure
 * @param opcode SCSI operation being performed
 * @param wait if 1, wait for I/O to complete
 */
static void
storvsc_timeout_test(struct hv_storvsc_request *reqp,
		uint8_t opcode, int wait)
{
	int ret;
	union ccb *ccb = reqp->ccb;
	struct storvsc_softc *sc = reqp->softc;

	if (reqp->vstor_packet.vm_srb.cdb[0] != opcode) {
		return;
	}

	if (wait) {
		mtx_lock(&reqp->event.mtx);
	}
	ret = hv_storvsc_io_request(sc, reqp);
	if (ret != 0) {
		if (wait) {
			mtx_unlock(&reqp->event.mtx);
		}
		printf("%s: io_request failed with %d.\n",
				__func__, ret);
		ccb->ccb_h.status = CAM_PROVIDE_FAIL;
		mtx_lock(&sc->hs_lock);
		storvsc_free_request(sc, reqp);
		xpt_done(ccb);
		mtx_unlock(&sc->hs_lock);
		return;
	}

	if (wait) {
		xpt_print(ccb->ccb_h.path,
				"%u: %s: waiting for IO return.\n",
				ticks, __func__);
		ret = cv_timedwait(&reqp->event.cv, &reqp->event.mtx, 60*hz);
		mtx_unlock(&reqp->event.mtx);
		xpt_print(ccb->ccb_h.path, "%u: %s: %s.\n",
				ticks, __func__, (ret == 0)?
				"IO return detected" :
				"IO return not detected");
		/*
		 * Now both the timer handler and io done are running
		 * simultaneously. We want to confirm that io done always
		 * finishes after the timer handler exits, so the reqp used
		 * by the timer handler is not freed or stale. Busy-loop for
		 * another 1/10 second to make sure io done does
		 * wait for the timer handler to complete.
		 */
		DELAY(100*1000);
		mtx_lock(&sc->hs_lock);
		xpt_print(ccb->ccb_h.path,
				"%u: %s: finishing, queue frozen %d, "
				"ccb status 0x%x scsi_status 0x%x.\n",
				ticks, __func__, sc->hs_frozen,
				ccb->ccb_h.status,
				ccb->csio.scsi_status);
		mtx_unlock(&sc->hs_lock);
	}
}
#endif /* HVS_TIMEOUT_TEST */

#ifdef notyet
/**
 * @brief timeout handler for requests
 *
 * This function is called as a result of a callout expiring.
 *
 * @param arg pointer to a request
 */
static void
storvsc_timeout(void *arg)
{
	struct hv_storvsc_request *reqp = arg;
	struct storvsc_softc *sc = reqp->softc;
	union ccb *ccb = reqp->ccb;

	if (reqp->retries == 0) {
		mtx_lock(&sc->hs_lock);
		xpt_print(ccb->ccb_h.path,
		    "%u: IO timed out (req=0x%p), wait for another %u secs.\n",
		    ticks, reqp, ccb->ccb_h.timeout / 1000);
		cam_error_print(ccb, CAM_ESF_ALL, CAM_EPF_ALL);
		mtx_unlock(&sc->hs_lock);

		reqp->retries++;
		callout_reset_sbt(&reqp->callout, SBT_1MS * ccb->ccb_h.timeout,
		    0, storvsc_timeout, reqp, 0);
#if HVS_TIMEOUT_TEST
		storvsc_timeout_test(reqp, SEND_DIAGNOSTIC, 0);
#endif
		return;
	}

	mtx_lock(&sc->hs_lock);
	xpt_print(ccb->ccb_h.path,
		"%u: IO (reqp = 0x%p) did not return for %u seconds, %s.\n",
		ticks, reqp, ccb->ccb_h.timeout * (reqp->retries+1) / 1000,
		(sc->hs_frozen == 0)?
		"freezing the queue" : "the queue is already frozen");
	if (sc->hs_frozen == 0) {
		sc->hs_frozen = 1;
		xpt_freeze_simq(xpt_path_sim(ccb->ccb_h.path), 1);
	}
	mtx_unlock(&sc->hs_lock);

#if HVS_TIMEOUT_TEST
	storvsc_timeout_test(reqp, MODE_SELECT_10, 1);
#endif
}
#endif

/**
 * @brief StorVSC device poll function
 *
 * This function is responsible for servicing requests when
 * interrupts are disabled (i.e., when we are dumping core).
 *
 * @param sim a pointer to a CAM SCSI interface module
 */
static void
storvsc_poll(struct cam_sim *sim)
{
	struct storvsc_softc *sc = cam_sim_softc(sim);

	mtx_assert(&sc->hs_lock, MA_OWNED);
	mtx_unlock(&sc->hs_lock);
	hv_storvsc_on_channel_callback(sc->hs_chan, sc);
	mtx_lock(&sc->hs_lock);
}

/**
 * @brief StorVSC device action function
 *
 * This function is responsible for handling SCSI operations which
 * are passed from the CAM layer.  The requests are in the form of
 * CAM control blocks which indicate the action being performed.
 * Not all actions require converting the request to a VSCSI protocol
 * message - these actions can be responded to by this driver.
 * Requests which are destined for a backend storage device are converted
 * to a VSCSI protocol message and sent on the channel connection associated
 * with this device.
 *
 * @param sim pointer to a CAM SCSI interface module
 * @param ccb pointer to a CAM control block
 */
static void
storvsc_action(struct cam_sim *sim, union ccb *ccb)
{
	struct storvsc_softc *sc = cam_sim_softc(sim);
	int res;

	mtx_assert(&sc->hs_lock, MA_OWNED);
	switch (ccb->ccb_h.func_code) {
	case XPT_PATH_INQ: {
		struct ccb_pathinq *cpi = &ccb->cpi;

		cpi->version_num = 1;
		cpi->hba_inquiry = PI_TAG_ABLE|PI_SDTR_ABLE;
		cpi->target_sprt = 0;
		cpi->hba_misc = PIM_NOBUSRESET;
		if (hv_storvsc_use_pim_unmapped)
			cpi->hba_misc |= PIM_UNMAPPED;
		cpi->maxio = STORVSC_DATA_SIZE_MAX;
		cpi->hba_eng_cnt = 0;
		cpi->max_target = STORVSC_MAX_TARGETS;
		cpi->max_lun = sc->hs_drv_props->drv_max_luns_per_target;
		cpi->initiator_id = cpi->max_target;
		cpi->bus_id = cam_sim_bus(sim);
		cpi->base_transfer_speed = 300000;
		cpi->transport = XPORT_SAS;
		cpi->transport_version = 0;
		cpi->protocol = PROTO_SCSI;
		cpi->protocol_version = SCSI_REV_SPC2;
		strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
		strlcpy(cpi->hba_vid, sc->hs_drv_props->drv_name, HBA_IDLEN);
		strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
		cpi->unit_number = cam_sim_unit(sim);

		ccb->ccb_h.status = CAM_REQ_CMP;
		xpt_done(ccb);
		return;
	}
	case XPT_GET_TRAN_SETTINGS: {
		struct  ccb_trans_settings *cts = &ccb->cts;

		cts->transport = XPORT_SAS;
		cts->transport_version = 0;
		cts->protocol = PROTO_SCSI;
		cts->protocol_version = SCSI_REV_SPC2;

		/* enable tag queuing and disconnected mode */
		cts->proto_specific.valid = CTS_SCSI_VALID_TQ;
		cts->proto_specific.scsi.valid = CTS_SCSI_VALID_TQ;
		cts->proto_specific.scsi.flags = CTS_SCSI_FLAGS_TAG_ENB;
		cts->xport_specific.valid = CTS_SPI_VALID_DISC;
		cts->xport_specific.spi.flags = CTS_SPI_FLAGS_DISC_ENB;

		ccb->ccb_h.status = CAM_REQ_CMP;
		xpt_done(ccb);
		return;
	}
	case XPT_SET_TRAN_SETTINGS: {
		ccb->ccb_h.status = CAM_REQ_CMP;
		xpt_done(ccb);
		return;
	}
	case XPT_CALC_GEOMETRY: {
		cam_calc_geometry(&ccb->ccg, 1);
		xpt_done(ccb);
		return;
	}
	case XPT_RESET_BUS:
	case XPT_RESET_DEV: {
#if HVS_HOST_RESET
		if ((res = hv_storvsc_host_reset(sc)) != 0) {
			xpt_print(ccb->ccb_h.path,
				"hv_storvsc_host_reset failed with %d\n", res);
			ccb->ccb_h.status = CAM_PROVIDE_FAIL;
			xpt_done(ccb);
			return;
		}
		ccb->ccb_h.status = CAM_REQ_CMP;
		xpt_done(ccb);
		return;
#else
		xpt_print(ccb->ccb_h.path,
				  "%s reset not supported.\n",
				  (ccb->ccb_h.func_code == XPT_RESET_BUS)?
				  "bus" : "dev");
		ccb->ccb_h.status = CAM_REQ_INVALID;
		xpt_done(ccb);
		return;
#endif	/* HVS_HOST_RESET */
	}
	case XPT_SCSI_IO:
	case XPT_IMMED_NOTIFY: {
		struct hv_storvsc_request *reqp = NULL;
		bus_dmamap_t dmap_saved;

		if (ccb->csio.cdb_len == 0) {
			panic("cdb_len is 0\n");
		}

		if (LIST_EMPTY(&sc->hs_free_list)) {
			ccb->ccb_h.status = CAM_REQUEUE_REQ;
			if (sc->hs_frozen == 0) {
				sc->hs_frozen = 1;
				xpt_freeze_simq(sim, /* count */1);
			}
			xpt_done(ccb);
			return;
		}

		reqp = LIST_FIRST(&sc->hs_free_list);
		LIST_REMOVE(reqp, link);

		/* Save the data_dmap before resetting the request */
		dmap_saved = reqp->data_dmap;

		/* XXX this is ugly */
		bzero(reqp, sizeof(struct hv_storvsc_request));

		/* Restore necessary bits */
		reqp->data_dmap = dmap_saved;
		reqp->softc = sc;

		ccb->ccb_h.status |= CAM_SIM_QUEUED;
		if ((res = create_storvsc_request(ccb, reqp)) != 0) {
			ccb->ccb_h.status = CAM_REQ_INVALID;
			xpt_done(ccb);
			return;
		}

#ifdef notyet
		if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
			callout_init(&reqp->callout, 1);
			callout_reset_sbt(&reqp->callout,
			    SBT_1MS * ccb->ccb_h.timeout, 0,
			    storvsc_timeout, reqp, 0);
#if HVS_TIMEOUT_TEST
			cv_init(&reqp->event.cv, "storvsc timeout cv");
			mtx_init(&reqp->event.mtx, "storvsc timeout mutex",
					NULL, MTX_DEF);
			switch (reqp->vstor_packet.vm_srb.cdb[0]) {
				case MODE_SELECT_10:
				case SEND_DIAGNOSTIC:
					/* To have timer send the request. */
					return;
				default:
					break;
			}
#endif /* HVS_TIMEOUT_TEST */
		}
#endif

		if ((res = hv_storvsc_io_request(sc, reqp)) != 0) {
			xpt_print(ccb->ccb_h.path,
				"hv_storvsc_io_request failed with %d\n", res);
			ccb->ccb_h.status = CAM_PROVIDE_FAIL;
			storvsc_free_request(sc, reqp);
			xpt_done(ccb);
			return;
		}
		return;
	}

	default:
		ccb->ccb_h.status = CAM_REQ_INVALID;
		xpt_done(ccb);
		return;
	}
}

/**
 * @brief destroy bounce buffer
 *
 * This function is responsible for destroying a scatter/gather list
 * that was created by storvsc_create_bounce_buffer(): the sglist is
 * returned to the free pool.
 *
 * @param sgl the scatter/gather list to be destroyed
 */
static void
storvsc_destroy_bounce_buffer(struct sglist *sgl)
{
	struct hv_sgl_node *sgl_node = NULL;
	if (LIST_EMPTY(&g_hv_sgl_page_pool.in_use_sgl_list)) {
		printf("storvsc error: not enough in-use sgl\n");
		return;
	}
	sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list);
	LIST_REMOVE(sgl_node, link);
	sgl_node->sgl_data = sgl;
	LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link);
}

/**
 * @brief create bounce buffer
 *
 * This function is responsible for creating a scatter/gather list
 * holding several pages, each of which is page-size aligned.
 *
 * @param seg_count number of SG-list segments
 * @param write if WRITE_TYPE, set each segment's used size to 0;
 *  otherwise set the used size to the page size.
 *
 * @returns NULL if creation failed
 */
static struct sglist *
storvsc_create_bounce_buffer(uint16_t seg_count, int write)
{
	int i = 0;
	struct sglist *bounce_sgl = NULL;
	unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE);
	struct hv_sgl_node *sgl_node = NULL;

	/* get a struct sglist from free_sgl_list */
	if (LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) {
		printf("storvsc error: not enough free sgl\n");
		return (NULL);
	}
	sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list);
	LIST_REMOVE(sgl_node, link);
	bounce_sgl = sgl_node->sgl_data;
	LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link);

	bounce_sgl->sg_maxseg = seg_count;

	if (write == WRITE_TYPE)
		bounce_sgl->sg_nseg = 0;
	else
		bounce_sgl->sg_nseg = seg_count;

	for (i = 0; i < seg_count; i++)
		bounce_sgl->sg_segs[i].ss_len = buf_len;

	return (bounce_sgl);
}

/**
 * @brief copy data from SG list to bounce buffer
 *
 * This function is responsible for copying data from one SG list's
 * segments into another SG list that is used as a bounce buffer.
 *
 * @param bounce_sgl the destination SG list
 * @param orig_sgl the segments of the source SG list
 * @param orig_sgl_count the number of segments
 * @param seg_bits bitmask indicating which segments need the bounce
 *  buffer; a set bit means the segment needs it
 */
static void
storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl,
			       bus_dma_segment_t *orig_sgl,
			       unsigned int orig_sgl_count,
			       uint64_t seg_bits)
{
	int src_sgl_idx = 0;

	for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) {
		if (seg_bits & (1 << src_sgl_idx)) {
			memcpy((void *)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr,
			    (void *)orig_sgl[src_sgl_idx].ds_addr,
			    orig_sgl[src_sgl_idx].ds_len);

			bounce_sgl->sg_segs[src_sgl_idx].ss_len =
			    orig_sgl[src_sgl_idx].ds_len;
		}
	}
}

/**
 * @brief copy data from an SG list used as a bounce buffer to another SG list
 *
 * This function is responsible for copying data from an SG list with
 * bounce buffers back into another SG list's segments.
 *
 * @param dest_sgl the destination SG list's segments
 * @param dest_sgl_count the number of destination SG list segments
 * @param src_sgl the source SG list
 * @param seg_bits bitmask indicating which segments of the source SG list
 *  used a bounce buffer
 */
void
storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl,
				    unsigned int dest_sgl_count,
				    struct sglist *src_sgl,
				    uint64_t seg_bits)
{
	int sgl_idx = 0;

	for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) {
		if (seg_bits & (1 << sgl_idx)) {
			memcpy((void *)(dest_sgl[sgl_idx].ds_addr),
			    (void *)(src_sgl->sg_segs[sgl_idx].ss_paddr),
			    src_sgl->sg_segs[sgl_idx].ss_len);
		}
	}
}

/**
 * @brief check whether an SG list needs a bounce buffer
 *
 * This function determines whether a bounce buffer is needed for the
 * given SG list.
 *
 * @param sgl the SG list's segments
 * @param sg_count the number of SG list segments
 * @param bits on success, set to a bitmask of the segments that need the
 *  bounce buffer
 *
 * @returns -1 if the SG list does not need a bounce buffer, 0 otherwise
 */
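/*
 * Example: for adjacent segments A and B, if A ends in the middle of a
 * page and B does not start at exactly the address where A ends, the
 * pair cannot be described by a contiguous page list; that is "a hole",
 * and a bounce buffer is required.
 */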
static int
storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl,
				unsigned int sg_count,
				uint64_t *bits)
{
	int i = 0;
	int offset = 0;
	uint64_t phys_addr = 0;
	uint64_t tmp_bits = 0;
	boolean_t found_hole = FALSE;
	boolean_t pre_aligned = TRUE;

	if (sg_count < 2) {
		return (-1);
	}

	*bits = 0;

	phys_addr = vtophys(sgl[0].ds_addr);
	offset = phys_addr - trunc_page(phys_addr);

	if (offset != 0) {
		pre_aligned = FALSE;
		tmp_bits |= 1;
	}

	for (i = 1; i < sg_count; i++) {
		phys_addr = vtophys(sgl[i].ds_addr);
		offset = phys_addr - trunc_page(phys_addr);

		if (offset == 0) {
			if (FALSE == pre_aligned) {
				/*
				 * This segment is aligned; if the previous
				 * one was not aligned, we found a hole.
				 */
				found_hole = TRUE;
			}
			pre_aligned = TRUE;
		} else {
			tmp_bits |= 1 << i;
			if (!pre_aligned) {
				if (phys_addr != vtophys(sgl[i-1].ds_addr +
				    sgl[i-1].ds_len)) {
					/*
					 * Check whether this segment is
					 * contiguous with the previous one;
					 * if not, we found a hole.
					 */
					found_hole = TRUE;
				}
			} else {
				found_hole = TRUE;
			}
			pre_aligned = FALSE;
		}
	}

	if (!found_hole) {
		return (-1);
	} else {
		*bits = tmp_bits;
		return (0);
	}
}

/**
 * Copy bus_dma segments into the multi-page buffer list, which requires
 * that all pages except the first and the last be full, page-aligned
 * pages.
 */
static void
storvsc_xferbuf_prepare(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
	struct hv_storvsc_request *reqp = arg;
	union ccb *ccb = reqp->ccb;
	struct ccb_scsiio *csio = &ccb->csio;
	struct storvsc_gpa_range *prplist;
	int i;

	prplist = &reqp->prp_list;
	prplist->gpa_range.gpa_len = csio->dxfer_len;
	prplist->gpa_range.gpa_ofs = segs[0].ds_addr & PAGE_MASK;

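	/*
	 * Each gpa_page[] entry below is a guest page frame number (atop
	 * of the segment's physical address); together with gpa_ofs and
	 * gpa_len it forms the PRP list handed to vmbus_chan_send_prplist().
	 */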
1816	for (i = 0; i < nsegs; i++) {
1817#ifdef INVARIANTS
1818		if (nsegs > 1) {
1819			if (i == 0) {
1820				KASSERT((segs[i].ds_addr & PAGE_MASK) +
1821				    segs[i].ds_len == PAGE_SIZE,
1822				    ("invalid 1st page, ofs 0x%jx, len %zu",
1823				     (uintmax_t)segs[i].ds_addr,
1824				     segs[i].ds_len));
1825			} else if (i == nsegs - 1) {
1826				KASSERT((segs[i].ds_addr & PAGE_MASK) == 0,
1827				    ("invalid last page, ofs 0x%jx",
1828				     (uintmax_t)segs[i].ds_addr));
1829			} else {
1830				KASSERT((segs[i].ds_addr & PAGE_MASK) == 0 &&
1831				    segs[i].ds_len == PAGE_SIZE,
1832				    ("not a full page, ofs 0x%jx, len %zu",
1833				     (uintmax_t)segs[i].ds_addr,
1834				     segs[i].ds_len));
1835			}
1836		}
1837#endif
1838		prplist->gpa_page[i] = atop(segs[i].ds_addr);
1839	}
1840	reqp->prp_cnt = nsegs;
1841}
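/*
 * Worked example with made-up numbers: for a 16KB transfer whose first
 * segment starts at physical address 0x11200, gpa_ofs is 0x200, gpa_len
 * is 16384, and gpa_page[] receives the PFNs 0x11, 0x12, ... covering the
 * buffer; the host reconstructs the buffer from this "PFN list +
 * first-page offset + total length" description.
 */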
1842
1843/**
1844 * @brief Fill in a request structure based on a CAM control block
1845 *
1846 * Fills in a request structure based on the contents of a CAM control
1847 * block.  The request structure holds the payload information for
1848 * a VSCSI protocol request.
1849 *
1850 * @param ccb pointer to a CAM control block
1851 * @param reqp pointer to a request structure
1852 */
1853static int
1854create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp)
1855{
1856	struct ccb_scsiio *csio = &ccb->csio;
1857	uint64_t phys_addr;
1858	uint32_t pfn;
1859	uint64_t not_aligned_seg_bits = 0;
1860	int error;
1861
1862	/* refer to struct vmscsi_req for meanings of these two fields */
1863	reqp->vstor_packet.u.vm_srb.port =
1864		cam_sim_unit(xpt_path_sim(ccb->ccb_h.path));
1865	reqp->vstor_packet.u.vm_srb.path_id =
1866		cam_sim_bus(xpt_path_sim(ccb->ccb_h.path));
1867
1868	reqp->vstor_packet.u.vm_srb.target_id = ccb->ccb_h.target_id;
1869	reqp->vstor_packet.u.vm_srb.lun = ccb->ccb_h.target_lun;
1870
1871	reqp->vstor_packet.u.vm_srb.cdb_len = csio->cdb_len;
1872	if (ccb->ccb_h.flags & CAM_CDB_POINTER) {
1873		memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_ptr,
1874			csio->cdb_len);
1875	} else {
1876		memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_bytes,
1877			csio->cdb_len);
1878	}
1879
1880	if (hv_storvsc_use_win8ext_flags) {
1881		reqp->vstor_packet.u.vm_srb.win8_extension.time_out_value = 60;
1882		reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
1883			SRB_FLAGS_DISABLE_SYNCH_TRANSFER;
1884	}
1885	switch (ccb->ccb_h.flags & CAM_DIR_MASK) {
1886	case CAM_DIR_OUT:
1887		reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE;
1888		if (hv_storvsc_use_win8ext_flags) {
1889			reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
1890				SRB_FLAGS_DATA_OUT;
1891		}
1892		break;
1893	case CAM_DIR_IN:
1894		reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE;
1895		if (hv_storvsc_use_win8ext_flags) {
1896			reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
1897				SRB_FLAGS_DATA_IN;
1898		}
1899		break;
1900	case CAM_DIR_NONE:
1901		reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE;
1902		if (hv_storvsc_use_win8ext_flags) {
1903			reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |=
1904				SRB_FLAGS_NO_DATA_TRANSFER;
1905		}
1906		break;
1907	default:
1908		printf("Error: unexpected data direction: 0x%x\n",
1909			ccb->ccb_h.flags & CAM_DIR_MASK);
1910		return (EINVAL);
1911	}
1912
1913	reqp->sense_data     = &csio->sense_data;
1914	reqp->sense_info_len = csio->sense_len;
1915
1916	reqp->ccb = ccb;
1917
1918	if (0 == csio->dxfer_len) {
1919		return (0);
1920	}
1921
1922	switch (ccb->ccb_h.flags & CAM_DATA_MASK) {
1923	case CAM_DATA_BIO:
1924	case CAM_DATA_VADDR:
1925		error = bus_dmamap_load_ccb(reqp->softc->storvsc_req_dtag,
1926		    reqp->data_dmap, ccb, storvsc_xferbuf_prepare, reqp,
1927		    BUS_DMA_NOWAIT);
1928		if (error) {
1929			xpt_print(ccb->ccb_h.path,
1930			    "bus_dmamap_load_ccb failed: %d\n", error);
1931			return (error);
1932		}
1933		if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO)
1934			reqp->softc->sysctl_data.data_bio_cnt++;
1935		else
1936			reqp->softc->sysctl_data.data_vaddr_cnt++;
1937		break;
1938
1939	case CAM_DATA_SG:
1940	{
1941		struct storvsc_gpa_range *prplist;
1942		int i = 0;
1943		int offset = 0;
1944		int ret;
1945
1946		bus_dma_segment_t *storvsc_sglist =
1947		    (bus_dma_segment_t *)ccb->csio.data_ptr;
1948		u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt;
1949
1950		prplist = &reqp->prp_list;
1951		prplist->gpa_range.gpa_len = csio->dxfer_len;
1952
1953		printf("Storvsc: got SG I/O operation, %d\n",
1954		    reqp->vstor_packet.u.vm_srb.data_in);
1955
1956		if (storvsc_sg_count > STORVSC_DATA_SEGCNT_MAX) {
1957			printf("Storvsc: %d segments are too many; "
1958			    "only %d segments are supported\n",
1959			    storvsc_sg_count, STORVSC_DATA_SEGCNT_MAX);
1960			return (EINVAL);
1961		}
1962
1963		/*
1964		 * We currently implement our own bounce buffer handling. Ideally
1965		 * we should use the BUS_DMA(9) framework, but the current BUS_DMA
1966		 * code provides no callback API to check the page alignment of
1967		 * middle segments before busdma decides whether a bounce buffer
1968		 * is needed for a particular segment. There is a callback,
1969		 * "bus_dma_filter_t *filter", but its parameters are not
1970		 * sufficient for the storvsc driver.
1971		 * TODO:
1972		 *	Add page alignment check in BUS_DMA(9) callback. Once
1973		 *	this is complete, switch the following code to use
1974		 *	BUS_DMA(9) for storvsc bounce buffer support.
1975		 */
1976		/* check if we need to create bounce buffer */
1977		ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist,
1978		    storvsc_sg_count, &not_aligned_seg_bits);
1979		if (ret != -1) {
1980			reqp->bounce_sgl =
1981			    storvsc_create_bounce_buffer(storvsc_sg_count,
1982			    reqp->vstor_packet.u.vm_srb.data_in);
1983			if (NULL == reqp->bounce_sgl) {
1984				printf("Storvsc_error: "
1985				    "create bounce buffer failed.\n");
1986				return (ENOMEM);
1987			}
1988
1989			reqp->bounce_sgl_count = storvsc_sg_count;
1990			reqp->not_aligned_seg_bits = not_aligned_seg_bits;
1991
1992			/*
1993			 * If this is a write, copy the original data to
1994			 * the bounce buffer.
1995			 */
1996			if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) {
1997				storvsc_copy_sgl_to_bounce_buf(
1998				    reqp->bounce_sgl,
1999				    storvsc_sglist,
2000				    storvsc_sg_count,
2001				    reqp->not_aligned_seg_bits);
2002			}
2003
2004			/* translate virtual addresses into physical frame numbers */
2005			if (reqp->not_aligned_seg_bits & 0x1) {
2006				phys_addr =
2007				    vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr);
2008			} else {
2009				phys_addr =
2010				    vtophys(storvsc_sglist[0].ds_addr);
2011			}
2012			prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK;
2013
2014			pfn = phys_addr >> PAGE_SHIFT;
2015			prplist->gpa_page[0] = pfn;
2016
2017			for (i = 1; i < storvsc_sg_count; i++) {
2018				if (reqp->not_aligned_seg_bits & (1ULL << i)) {
2019					phys_addr =
2020					    vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr);
2021				} else {
2022					phys_addr =
2023					    vtophys(storvsc_sglist[i].ds_addr);
2024				}
2025
2026				pfn = phys_addr >> PAGE_SHIFT;
2027				prplist->gpa_page[i] = pfn;
2028			}
2029			reqp->prp_cnt = i;
2030		} else {
2031			phys_addr = vtophys(storvsc_sglist[0].ds_addr);
2032
2033			prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK;
2034
2035			for (i = 0; i < storvsc_sg_count; i++) {
2036				phys_addr = vtophys(storvsc_sglist[i].ds_addr);
2037				pfn = phys_addr >> PAGE_SHIFT;
2038				prplist->gpa_page[i] = pfn;
2039			}
2040			reqp->prp_cnt = i;
2041
2042			/* check whether the last segment crosses a page boundary */
2043			offset = phys_addr & PAGE_MASK;
2044			if (offset) {
2045				/* Add one more PRP entry */
2046				phys_addr =
2047				    vtophys(storvsc_sglist[i-1].ds_addr +
2048				    PAGE_SIZE - offset);
2049				pfn = phys_addr >> PAGE_SHIFT;
2050				prplist->gpa_page[i] = pfn;
2051				reqp->prp_cnt++;
2052			}
2053
2054			reqp->bounce_sgl_count = 0;
2055		}
2056		reqp->softc->sysctl_data.data_sg_cnt++;
2057		break;
2058	}
2059	default:
2060		printf("Unknown flags: %d\n", ccb->ccb_h.flags);
2061		return (EINVAL);
2062	}
2063
2064	return (0);
2065}
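/*
 * To recap the data-path cases handled above: CAM_DATA_BIO and
 * CAM_DATA_VADDR are mapped through bus_dmamap_load_ccb() and the
 * storvsc_xferbuf_prepare() callback, while CAM_DATA_SG builds the PRP
 * list by hand, bouncing any segments flagged by
 * storvsc_check_bounce_buffer_sgl().
 */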
2066
2067static uint32_t
2068is_scsi_valid(const struct scsi_inquiry_data *inq_data)
2069{
2070	u_int8_t type;
2071
2072	type = SID_TYPE(inq_data);
2073	if (type == T_NODEVICE)
2074		return (0);
2075	if (SID_QUAL(inq_data) == SID_QUAL_BAD_LU)
2076		return (0);
2077	return (1);
2078}
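/*
 * For example, an INQUIRY response with peripheral device type 0x1f
 * (T_NODEVICE) or peripheral qualifier 011b (SID_QUAL_BAD_LU) describes a
 * LUN that is not actually connectable, so is_scsi_valid() reports it as
 * invalid.
 */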
2079
2080/**
2081 * @brief completion function before returning to CAM
2082 *
2083 * I/O process has been completed and the result needs
2084 * to be passed to the CAM layer.
2085 * Free resources related to this request.
2086 *
2087 * @param reqp pointer to a request structure
2088 */
2089static void
2090storvsc_io_done(struct hv_storvsc_request *reqp)
2091{
2092	union ccb *ccb = reqp->ccb;
2093	struct ccb_scsiio *csio = &ccb->csio;
2094	struct storvsc_softc *sc = reqp->softc;
2095	struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb;
2096	bus_dma_segment_t *ori_sglist = NULL;
2097	int ori_sg_count = 0;
2098
2099	/* destroy the bounce buffer if one was used */
2100	if (reqp->bounce_sgl_count) {
2101		ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr;
2102		ori_sg_count = ccb->csio.sglist_cnt;
2103
2104		/*
2105		 * If this was a READ operation, copy the data back
2106		 * to the original SG list.
2107		 */
2108		if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) {
2109			storvsc_copy_from_bounce_buf_to_sgl(ori_sglist,
2110			    ori_sg_count,
2111			    reqp->bounce_sgl,
2112			    reqp->not_aligned_seg_bits);
2113		}
2114
2115		storvsc_destroy_bounce_buffer(reqp->bounce_sgl);
2116		reqp->bounce_sgl_count = 0;
2117	}
2118
2119	if (reqp->retries > 0) {
2120		mtx_lock(&sc->hs_lock);
2121#if HVS_TIMEOUT_TEST
2122		xpt_print(ccb->ccb_h.path,
2123			"%u: IO returned after timeout, "
2124			"waking up timer handler if any.\n", ticks);
2125		mtx_lock(&reqp->event.mtx);
2126		cv_signal(&reqp->event.cv);
2127		mtx_unlock(&reqp->event.mtx);
2128#endif
2129		reqp->retries = 0;
2130		xpt_print(ccb->ccb_h.path,
2131			"%u: IO returned after timeout, "
2132			"stopping timer if any.\n", ticks);
2133		mtx_unlock(&sc->hs_lock);
2134	}
2135
2136#ifdef notyet
2137	/*
2138	 * callout_drain() will wait for the timer handler to finish
2139	 * if it is running. So we don't need any lock to synchronize
2140	 * between this routine and the timer handler.
2141	 * Note that we need to make sure reqp is not freed when timer
2142	 * handler is using or will use it.
2143	 */
2144	if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) {
2145		callout_drain(&reqp->callout);
2146	}
2147#endif
2148
2149	ccb->ccb_h.status &= ~CAM_SIM_QUEUED;
2150	ccb->ccb_h.status &= ~CAM_STATUS_MASK;
2151	if (vm_srb->scsi_status == SCSI_STATUS_OK) {
2152		const struct scsi_generic *cmd;
2153
2154		cmd = (const struct scsi_generic *)
2155		    ((ccb->ccb_h.flags & CAM_CDB_POINTER) ?
2156		     csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes);
2157		if (vm_srb->srb_status != SRB_STATUS_SUCCESS) {
2158			/*
2159			 * If there are errors, for example, invalid LUN,
2160			 * host will inform VM through SRB status.
2161			 */
2162			if (bootverbose) {
2163				if (vm_srb->srb_status == SRB_STATUS_INVALID_LUN) {
2164					xpt_print(ccb->ccb_h.path,
2165					    "invalid LUN %d for op: %s\n",
2166					    vm_srb->lun,
2167					    scsi_op_desc(cmd->opcode, NULL));
2168				} else {
2169					xpt_print(ccb->ccb_h.path,
2170					    "Unknown SRB status: %d for op: %s\n",
2171					    vm_srb->srb_status,
2172					    scsi_op_desc(cmd->opcode, NULL));
2173				}
2174			}
2175
2176			/*
2177			 * XXX For a selection timeout, all of the LUNs
2178			 * on the target will be gone.  It works for SCSI
2179			 * disks, but does not work for IDE disks.
2180			 *
2181			 * For CAM_DEV_NOT_THERE, CAM will only get
2182			 * rid of the device(s) specified by the path.
2183			 */
2184			if (storvsc_get_storage_type(sc->hs_dev) ==
2185			    DRIVER_STORVSC)
2186				ccb->ccb_h.status |= CAM_SEL_TIMEOUT;
2187			else
2188				ccb->ccb_h.status |= CAM_DEV_NOT_THERE;
2189		} else {
2190			ccb->ccb_h.status |= CAM_REQ_CMP;
2191		}
2192
2193		if (cmd->opcode == INQUIRY &&
2194		    vm_srb->srb_status == SRB_STATUS_SUCCESS) {
2195			int resp_xfer_len, resp_buf_len, data_len;
2196			uint8_t *resp_buf = (uint8_t *)csio->data_ptr;
2197			struct scsi_inquiry_data *inq_data =
2198			    (struct scsi_inquiry_data *)csio->data_ptr;
2199
2200			/* Get the buffer length reported by host */
2201			resp_xfer_len = vm_srb->transfer_len;
2202
2203			/* Available length: resp_buf[4] ("additional length") + 5-byte INQUIRY header */
2204			resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0;
2205			data_len = (resp_buf_len < resp_xfer_len) ?
2206			    resp_buf_len : resp_xfer_len;
2207			if (bootverbose && data_len >= 5) {
2208				xpt_print(ccb->ccb_h.path, "storvsc inquiry "
2209				    "(%d) [%x %x %x %x %x ... ]\n", data_len,
2210				    resp_buf[0], resp_buf[1], resp_buf[2],
2211				    resp_buf[3], resp_buf[4]);
2212			}
2213			/*
2214			 * XXX: Manually fix the wrong response returned from WS2012
2215			 */
2216			if (!is_scsi_valid(inq_data) &&
2217			    (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
2218			    vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8 ||
2219			    vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN7)) {
2220				if (data_len >= 4 &&
2221				    (resp_buf[2] == 0 || resp_buf[3] == 0)) {
2222					resp_buf[2] = 5; // version=5 means SPC-3
2223					resp_buf[3] = 2; // resp fmt must be 2
2224					if (bootverbose)
2225						xpt_print(ccb->ccb_h.path,
2226						    "fix version and resp fmt for 0x%x\n",
2227						    vmstor_proto_version);
2228				}
2229			} else if (data_len >= SHORT_INQUIRY_LENGTH) {
2230				char vendor[16];
2231
2232				cam_strvis(vendor, inq_data->vendor,
2233				    sizeof(inq_data->vendor), sizeof(vendor));
2234				/*
2235				 * XXX: Upgrade SPC2 to SPC3 if host is WIN8 or
2236				 * WIN2012 R2 in order to support UNMAP feature.
2237				 */
2238				if (!strncmp(vendor, "Msft", 4) &&
2239				    SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 &&
2240				    (vmstor_proto_version ==
2241				     VMSTOR_PROTOCOL_VERSION_WIN8_1 ||
2242				     vmstor_proto_version ==
2243				     VMSTOR_PROTOCOL_VERSION_WIN8)) {
2244					inq_data->version = SCSI_REV_SPC3;
2245					if (bootverbose) {
2246						xpt_print(ccb->ccb_h.path,
2247						    "storvsc upgrades "
2248						    "SPC2 to SPC3\n");
2249					}
2250				}
2251			}
2252		}
2253	} else {
2254		mtx_lock(&sc->hs_lock);
2255		xpt_print(ccb->ccb_h.path,
2256			"storvsc scsi_status = %d\n",
2257			vm_srb->scsi_status);
2258		mtx_unlock(&sc->hs_lock);
2259		ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR;
2260	}
2261
2262	ccb->csio.scsi_status = (vm_srb->scsi_status & 0xFF);
2263	ccb->csio.resid = ccb->csio.dxfer_len - vm_srb->transfer_len;
2264
2265	if (reqp->sense_info_len != 0) {
2266		csio->sense_resid = csio->sense_len - reqp->sense_info_len;
2267		ccb->ccb_h.status |= CAM_AUTOSNS_VALID;
2268	}
2269
2270	mtx_lock(&sc->hs_lock);
2271	if (reqp->softc->hs_frozen == 1) {
2272		xpt_print(ccb->ccb_h.path,
2273			"%u: storvsc unfreezing softc 0x%p.\n",
2274			ticks, reqp->softc);
2275		ccb->ccb_h.status |= CAM_RELEASE_SIMQ;
2276		reqp->softc->hs_frozen = 0;
2277	}
2278	storvsc_free_request(sc, reqp);
2279	mtx_unlock(&sc->hs_lock);
2280
2281	xpt_done_direct(ccb);
2282}
2283
2284/**
2285 * @brief Free a request structure
2286 *
2287 * Free a request structure by returning it to the free list
2288 *
2289 * @param sc pointer to a softc
2290 * @param reqp pointer to a request structure
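 * Note: callers such as storvsc_io_done() above hold sc->hs_lock here.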
2291 */
2292static void
2293storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp)
2294{
2295
2296	LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link);
2297}
2298
2299/**
2300 * @brief Determine type of storage device from GUID
2301 *
2302 * Using the type GUID, determine if this is a StorVSC (paravirtual
2303 * SCSI) or BlkVSC (paravirtual IDE) device.
2304 *
2305 * @param dev a device
2306 * returns an enum hv_storage_type identifying the device
2307 */
2308static enum hv_storage_type
2309storvsc_get_storage_type(device_t dev)
2310{
2311	device_t parent = device_get_parent(dev);
2312
2313	if (VMBUS_PROBE_GUID(parent, dev, &gBlkVscDeviceType) == 0)
2314		return DRIVER_BLKVSC;
2315	if (VMBUS_PROBE_GUID(parent, dev, &gStorVscDeviceType) == 0)
2316		return DRIVER_STORVSC;
2317	return DRIVER_UNKNOWN;
2318}
2319
2320#define	PCI_VENDOR_INTEL	0x8086
2321#define	PCI_PRODUCT_PIIX4	0x7111
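/*
 * Hyper-V's emulated (Gen 1) IDE controller presents itself as an Intel
 * PIIX4 (vendor 0x8086, device 0x7111), which is what the vendor/device
 * check in storvsc_ada_probe_veto() below matches.
 */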
2322
2323static void
2324storvsc_ada_probe_veto(void *arg __unused, struct cam_path *path,
2325    struct ata_params *ident_buf __unused, int *veto)
2326{
2327
2328	/*
2329	 * The ATA disks are shared with the controllers managed
2330	 * by this driver, so veto the ATA disks' attachment; the
2331	 * ATA disks will be attached as SCSI disks once this driver
2332	 * is attached.
2333	 */
2334	if (path->device->protocol == PROTO_ATA) {
2335		struct ccb_pathinq cpi;
2336
2337		bzero(&cpi, sizeof(cpi));
2338		xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE);
2339		cpi.ccb_h.func_code = XPT_PATH_INQ;
2340		xpt_action((union ccb *)&cpi);
2341		if (cpi.ccb_h.status == CAM_REQ_CMP &&
2342		    cpi.hba_vendor == PCI_VENDOR_INTEL &&
2343		    cpi.hba_device == PCI_PRODUCT_PIIX4) {
2344			(*veto)++;
2345			if (bootverbose) {
2346				xpt_print(path,
2347				    "Disable ATA disks on "
2348				    "simulated ATA controller (0x%04x%04x)\n",
2349				    cpi.hba_device, cpi.hba_vendor);
2350			}
2351		}
2352	}
2353}
2354
2355static void
2356storvsc_sysinit(void *arg __unused)
2357{
2358	if (vm_guest == VM_GUEST_HV) {
2359		storvsc_handler_tag = EVENTHANDLER_REGISTER(ada_probe_veto,
2360		    storvsc_ada_probe_veto, NULL, EVENTHANDLER_PRI_ANY);
2361	}
2362}
2363SYSINIT(storvsc_sys_init, SI_SUB_DRIVERS, SI_ORDER_SECOND, storvsc_sysinit,
2364    NULL);
2365
2366static void
2367storvsc_sysuninit(void *arg __unused)
2368{
2369	if (storvsc_handler_tag != NULL)
2370		EVENTHANDLER_DEREGISTER(ada_probe_veto, storvsc_handler_tag);
2371}
2372SYSUNINIT(storvsc_sys_uninit, SI_SUB_DRIVERS, SI_ORDER_SECOND,
2373    storvsc_sysuninit, NULL);
2374