1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26#ifndef	_VDC_H
27#define	_VDC_H
28
29/*
30 * Virtual disk client implementation definitions
31 */
32
33#include <sys/sysmacros.h>
34#include <sys/note.h>
35
36#include <sys/ldc.h>
37#include <sys/vio_mailbox.h>
38#include <sys/vdsk_mailbox.h>
39#include <sys/vdsk_common.h>
40
41#ifdef	__cplusplus
42extern "C" {
43#endif
44
45#define	VDC_DRIVER_NAME		"vdc"
46
47/*
48 * Bit-field values to indicate if parts of the vdc driver are initialised.
49 */
50#define	VDC_SOFT_STATE	0x0001
51#define	VDC_LOCKS	0x0002
52#define	VDC_MINOR	0x0004
53#define	VDC_THREAD	0x0008
54#define	VDC_DRING_INIT	0x0010	/* The DRing was created */
55#define	VDC_DRING_BOUND	0x0020	/* The DRing was bound to an LDC channel */
56#define	VDC_DRING_LOCAL	0x0040	/* The local private DRing was allocated */
57#define	VDC_DRING_ENTRY	0x0080	/* At least one DRing entry was initialised */
58#define	VDC_DRING	(VDC_DRING_INIT | VDC_DRING_BOUND |	\
59				VDC_DRING_LOCAL | VDC_DRING_ENTRY)
60#define	VDC_HANDSHAKE	0x0100	/* Indicates if a handshake is in progress */
61#define	VDC_HANDSHAKE_STOP	0x0200	/* stop further handshakes */
62
63/*
64 * Definitions of MD nodes/properties.
65 */
66#define	VDC_MD_CHAN_NAME		"channel-endpoint"
67#define	VDC_MD_VDEV_NAME		"virtual-device"
68#define	VDC_MD_PORT_NAME		"virtual-device-port"
69#define	VDC_MD_DISK_NAME		"disk"
70#define	VDC_MD_CFG_HDL			"cfg-handle"
71#define	VDC_MD_TIMEOUT			"vdc-timeout"
72#define	VDC_MD_ID			"id"
73
74/*
75 * Definition of actions to be carried out when processing the sequence ID
76 * of a message received from the vDisk server. The function verifying the
77 * sequence number checks the 'seq_num_xxx' fields in the soft state and
78 * returns whether the message should be processed (VDC_SEQ_NUM_TODO) or
79 * whether it was it was previously processed (VDC_SEQ_NUM_SKIP).
80 */
81#define	VDC_SEQ_NUM_INVALID		-1	/* Error */
82#define	VDC_SEQ_NUM_SKIP		0	/* Request already processed */
83#define	VDC_SEQ_NUM_TODO		1	/* Request needs processing */
84
85/*
86 * DRing reserved entries. Entry 0 is reserved and only used for error
87 * checking. This is done so that error checking can be done even if the
88 * DRing is full. All other entries are available for regular I/Os.
89 */
90#define	VDC_DRING_NUM_RESV		1	/* #reserved entries */
91#define	VDC_DRING_FIRST_RESV		0	/* 1st reserved entry */
92#define	VDC_DRING_FIRST_ENTRY		\
93	    (VDC_DRING_FIRST_RESV + VDC_DRING_NUM_RESV)	/* 1st non-resv entry */
94
95/*
96 * Flags for virtual disk operations.
97 */
98#define	VDC_OP_STATE_RUNNING	0x01	/* do operation in running state */
99#define	VDC_OP_ERRCHK_BACKEND	0x02	/* check backend on error */
100#define	VDC_OP_ERRCHK_CONFLICT	0x04	/* check resv conflict on error */
101#define	VDC_OP_DRING_RESERVED	0x08	/* use dring reserved entry */
102#define	VDC_OP_RESUBMIT		0x10	/* I/O is being resubmitted */
103
104#define	VDC_OP_ERRCHK	(VDC_OP_ERRCHK_BACKEND | VDC_OP_ERRCHK_CONFLICT)
105#define	VDC_OP_NORMAL	(VDC_OP_STATE_RUNNING | VDC_OP_ERRCHK)
106
107/*
108 * Macros to get UNIT and PART number
109 */
110#define	VDCUNIT_SHIFT	3
111#define	VDCPART_MASK	7
112
113#define	VDCUNIT(dev)	(getminor((dev)) >> VDCUNIT_SHIFT)
114#define	VDCPART(dev)	(getminor((dev)) &  VDCPART_MASK)
115
116/*
117 * Scheme to store the instance number and the slice number in the minor number.
118 * (NOTE: Uses the same format and definitions as the sd(7D) driver)
119 */
120#define	VD_MAKE_DEV(instance, minor)	((instance << VDCUNIT_SHIFT) | minor)
121
122#define	VDC_EFI_DEV_SET(dev, vdsk, ioctl)	\
123	VDSK_EFI_DEV_SET(dev, vdsk, ioctl,	\
124	    (vdsk)->vdisk_bsize, (vdsk)->vdisk_size)
125
126/* max number of handshake retries per server */
127#define	VDC_HSHAKE_RETRIES	3
128
129/* minimum number of attribute negotiations before handshake failure */
130#define	VDC_HATTR_MIN_INITIAL	3
131#define	VDC_HATTR_MIN		1
132
133/*
134 * This macro returns the number of Hz that the vdc driver should wait before
135 * a timeout is triggered. The 'timeout' parameter specifiecs the wait
136 * time in Hz. The 'mul' parameter allows for a multiplier to be
137 * specified allowing for a backoff to be implemented (e.g. using the
138 * retry number as a multiplier) where the wait time will get longer if
139 * there is no response on the previous retry.
140 */
141#define	VD_GET_TIMEOUT_HZ(timeout, mul)	\
142	(ddi_get_lbolt() + ((timeout) * MAX(1, (mul))))
143
144/*
145 * Macros to manipulate Descriptor Ring variables in the soft state
146 * structure.
147 */
148#define	VDC_GET_NEXT_REQ_ID(vdc)	((vdc)->req_id++)
149
150#define	VDC_GET_DRING_ENTRY_PTR(vdc, idx)	\
151		(vd_dring_entry_t *)(uintptr_t)((vdc)->dring_mem_info.vaddr + \
152			(idx * (vdc)->dring_entry_size))
153
154#define	VDC_MARK_DRING_ENTRY_FREE(vdc, idx)			\
155	{ \
156		vd_dring_entry_t *dep = NULL;				\
157		ASSERT(vdc != NULL);					\
158		ASSERT(idx < vdc->dring_len);		\
159		ASSERT(vdc->dring_mem_info.vaddr != NULL);		\
160		dep = (vd_dring_entry_t *)(uintptr_t)			\
161			(vdc->dring_mem_info.vaddr +	\
162			(idx * vdc->dring_entry_size));			\
163		ASSERT(dep != NULL);					\
164		dep->hdr.dstate = VIO_DESC_FREE;			\
165	}
166
167/* Initialise the Session ID and Sequence Num in the DRing msg */
168#define	VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc)		\
169		ASSERT(vdc != NULL);			\
170		dmsg.tag.vio_sid = vdc->session_id;	\
171		dmsg.seq_num = vdc->seq_num;
172
173/*
174 * The states that the read thread can be in.
175 */
176typedef enum vdc_rd_state {
177	VDC_READ_IDLE,			/* idling - conn is not up */
178	VDC_READ_WAITING,		/* waiting for data */
179	VDC_READ_PENDING,		/* pending data avail for read */
180	VDC_READ_RESET			/* channel was reset - stop reads */
181} vdc_rd_state_t;
182
183/*
184 * The states that the vdc-vds connection can be in.
185 */
186typedef enum vdc_state {
187	VDC_STATE_INIT,			/* device is initialized */
188	VDC_STATE_INIT_WAITING,		/* waiting for ldc connection */
189	VDC_STATE_NEGOTIATE,		/* doing handshake negotiation */
190	VDC_STATE_HANDLE_PENDING,	/* handle requests in backup dring */
191	VDC_STATE_FAULTED,		/* multipath backend is inaccessible */
192	VDC_STATE_FAILED,		/* device is not usable */
193	VDC_STATE_RUNNING,		/* running and accepting requests */
194	VDC_STATE_DETACH,		/* detaching */
195	VDC_STATE_RESETTING		/* resetting connection with vds */
196} vdc_state_t;
197
198/*
199 * States of the service provided by a vds server
200 */
201typedef enum vdc_service_state {
202	VDC_SERVICE_NONE = -1, 		/* no state define */
203	VDC_SERVICE_OFFLINE,		/* no connection with the service */
204	VDC_SERVICE_CONNECTED,		/* connection established */
205	VDC_SERVICE_ONLINE,		/* connection and backend available */
206	VDC_SERVICE_FAILED,		/* connection failed */
207	VDC_SERVICE_FAULTED		/* connection but backend unavailable */
208} vdc_service_state_t;
209
210/*
211 * The states that the vdc instance can be in.
212 */
213typedef enum vdc_lc_state {
214	VDC_LC_ATTACHING,	/* driver is attaching */
215	VDC_LC_ONLINE_PENDING,	/* driver is attached, handshake pending */
216	VDC_LC_ONLINE,		/* driver is attached and online */
217	VDC_LC_DETACHING	/* driver is detaching */
218} vdc_lc_state_t;
219
220/*
221 * Local Descriptor Ring entry
222 *
223 * vdc creates a Local (private) descriptor ring the same size as the
224 * public descriptor ring it exports to vds.
225 */
226
227typedef enum {
228	VIO_read_dir,		/* read data from server */
229	VIO_write_dir,		/* write data to server */
230	VIO_both_dir		/* transfer both in and out in same buffer */
231} vio_desc_direction_t;
232
233typedef struct vdc_local_desc {
234	boolean_t		is_free;	/* local state - inuse or not */
235
236	int			operation;	/* VD_OP_xxx to be performed */
237	caddr_t			addr;		/* addr passed in by consumer */
238	int			slice;
239	diskaddr_t		offset;		/* disk offset */
240	size_t			nbytes;
241	struct buf		*buf;		/* buf of operation */
242	vio_desc_direction_t	dir;		/* direction of transfer */
243	int			flags;		/* flags of operation */
244
245	caddr_t			align_addr;	/* used if addr non-aligned */
246	ldc_mem_handle_t	desc_mhdl;	/* Mem handle of buf */
247	vd_dring_entry_t	*dep;		/* public Dring Entry Pointer */
248
249} vdc_local_desc_t;
250
251/*
252 * I/O queue used for checking backend or failfast
253 */
254typedef struct vdc_io {
255	struct vdc_io	*vio_next;	/* next pending I/O in the queue */
256	int		vio_index;	/* descriptor index */
257	clock_t		vio_qtime;	/* time the I/O was queued */
258} vdc_io_t;
259
260/*
261 * Per vDisk server channel states
262 */
263#define	VDC_LDC_INIT	0x0001
264#define	VDC_LDC_CB	0x0002
265#define	VDC_LDC_OPEN	0x0004
266#define	VDC_LDC		(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN)
267
268/*
269 * vDisk server information
270 */
271typedef struct vdc_server {
272	struct vdc_server	*next;			/* Next server */
273	struct vdc		*vdcp;			/* Ptr to vdc struct */
274	uint64_t		id;			/* Server port id */
275	uint64_t		state;			/* Server state */
276	vdc_service_state_t	svc_state;		/* Service state */
277	vdc_service_state_t	log_state;		/* Last state logged */
278	uint64_t		ldc_id;			/* Server LDC id */
279	ldc_handle_t		ldc_handle;		/* Server LDC handle */
280	ldc_status_t		ldc_state;		/* Server LDC state */
281	uint64_t		ctimeout;		/* conn tmout (secs) */
282	uint_t			hshake_cnt;		/* handshakes count */
283	uint_t			hattr_cnt;		/* attr. neg. count */
284	uint_t			hattr_total;		/* attr. neg. total */
285} vdc_server_t;
286
287/*
288 * vdc soft state structure
289 */
290typedef struct vdc {
291
292	kmutex_t	lock;		/* protects next 2 sections of vars */
293	kcondvar_t	running_cv;	/* signal when upper layers can send */
294	kcondvar_t	initwait_cv;	/* signal when ldc conn is up */
295	kcondvar_t	dring_free_cv;	/* signal when desc is avail */
296	kcondvar_t	membind_cv;	/* signal when mem can be bound */
297	boolean_t	self_reset;	/* self initiated reset */
298	kcondvar_t	io_pending_cv;	/* signal on pending I/O */
299	boolean_t	io_pending;	/* pending I/O */
300
301	int		initialized;	/* keeps track of what's init'ed */
302	vdc_lc_state_t	lifecycle;	/* Current state of the vdc instance */
303	uint_t		hattr_min;	/* min. # attribute negotiations */
304
305	uint8_t		open[OTYPCNT];	/* mask of opened slices */
306	uint8_t		open_excl;	/* mask of exclusively opened slices */
307	ulong_t		open_lyr[V_NUMPAR]; /* number of layered opens */
308	int		dkio_flush_pending; /* # outstanding DKIO flushes */
309	int		validate_pending; /* # outstanding validate request */
310	vd_disk_label_t vdisk_label; 	/* label type of device/disk imported */
311	struct extvtoc	*vtoc;		/* structure to store VTOC data */
312	struct dk_geom	*geom;		/* structure to store geometry data */
313	vd_slice_t	slice[V_NUMPAR]; /* logical partitions */
314
315	kthread_t	*msg_proc_thr;	/* main msg processing thread */
316
317	kmutex_t	read_lock;	/* lock to protect read */
318	kcondvar_t	read_cv;	/* cv to wait for READ events */
319	vdc_rd_state_t	read_state;	/* current read state */
320
321	uint32_t	sync_op_cnt;	/* num of active sync operations */
322	boolean_t	sync_op_blocked; /* blocked waiting to do sync op */
323	kcondvar_t	sync_blocked_cv; /* cv wait for other syncs to finish */
324
325	uint64_t	session_id;	/* common ID sent with all messages */
326	uint64_t	seq_num;	/* most recent sequence num generated */
327	uint64_t	seq_num_reply;	/* Last seq num ACK/NACK'ed by vds */
328	uint64_t	req_id;		/* Most recent Request ID generated */
329	uint64_t	req_id_proc;	/* Last request ID processed by vdc */
330	vdc_state_t	state;		/* Current disk client-server state */
331
332	dev_info_t	*dip;		/* device info pointer */
333	int		instance;	/* driver instance number */
334
335	vio_ver_t	ver;		/* version number agreed with server */
336	vd_disk_type_t	vdisk_type;	/* type of device/disk being imported */
337	uint32_t	vdisk_media;	/* physical media type of vDisk */
338	uint64_t	vdisk_size;	/* device size in blocks */
339	uint64_t	max_xfer_sz;	/* maximum block size of a descriptor */
340	uint64_t	vdisk_bsize;	/* blk size for the virtual disk */
341	uint32_t	vio_bmask;	/* mask to check vio blk alignment */
342	int		vio_bshift;	/* shift for vio blk conversion */
343	uint64_t	operations;	/* bitmask of ops. server supports */
344	struct dk_cinfo	*cinfo;		/* structure to store DKIOCINFO data */
345	struct dk_minfo	*minfo;		/* structure for DKIOCGMEDIAINFO data */
346	ddi_devid_t	devid;		/* device id */
347	boolean_t	ctimeout_reached; /* connection timeout has expired */
348
349	/*
350	 * The ownership fields are protected by the lock mutex. The
351	 * ownership_lock mutex is used to serialize ownership operations;
352	 * it should be acquired before the lock mutex.
353	 */
354	kmutex_t	ownership_lock;		/* serialize ownership ops */
355	int		ownership;		/* ownership status flags */
356	kthread_t	*ownership_thread;	/* ownership thread */
357	kcondvar_t	ownership_cv;		/* cv for ownership update */
358
359	/*
360	 * The eio and failfast fields are protected by the lock mutex.
361	 */
362	kthread_t	*eio_thread;		/* error io thread */
363	kcondvar_t	eio_cv;			/* cv for eio thread update */
364	vdc_io_t	*eio_queue;		/* error io queue */
365	clock_t		failfast_interval;	/* interval in microsecs */
366
367	/*
368	 * kstats used to store I/O statistics consumed by iostat(1M).
369	 * These are protected by the lock mutex.
370	 */
371	kstat_t		*io_stats;
372	kstat_t		*err_stats;
373
374	ldc_dring_handle_t	dring_hdl;		/* dring handle */
375	ldc_mem_info_t		dring_mem_info;		/* dring information */
376	uint_t			dring_curr_idx;		/* current index */
377	uint32_t		dring_len;		/* dring length */
378	uint32_t		dring_max_cookies;	/* dring max cookies */
379	uint32_t		dring_cookie_count;	/* num cookies */
380	uint32_t		dring_entry_size;	/* descriptor size */
381	ldc_mem_cookie_t 	*dring_cookie;		/* dring cookies */
382	uint64_t		dring_ident;		/* dring ident */
383
384	uint64_t		threads_pending; 	/* num of threads */
385
386	vdc_local_desc_t	*local_dring;		/* local dring */
387	vdc_local_desc_t	*local_dring_backup;	/* local dring backup */
388	int			local_dring_backup_tail; /* backup dring tail */
389	int			local_dring_backup_len;	/* backup dring len */
390
391	int			num_servers;		/* no. of servers */
392	vdc_server_t		*server_list;		/* vdisk server list */
393	vdc_server_t		*curr_server;		/* curr vdisk server */
394} vdc_t;
395
396/*
397 * Ownership status flags
398 */
399#define	VDC_OWNERSHIP_NONE	0x00 /* no ownership wanted */
400#define	VDC_OWNERSHIP_WANTED	0x01 /* ownership is wanted */
401#define	VDC_OWNERSHIP_GRANTED	0x02 /* ownership has been granted */
402#define	VDC_OWNERSHIP_RESET	0x04 /* ownership has been reset */
403
404/*
405 * Reservation conflict panic message
406 */
407#define	VDC_RESV_CONFLICT_FMT_STR	"Reservation Conflict\nDisk: "
408#define	VDC_RESV_CONFLICT_FMT_LEN 	(sizeof (VDC_RESV_CONFLICT_FMT_STR))
409
410/*
411 * Debugging macros
412 */
413#ifdef DEBUG
414extern int	vdc_msglevel;
415extern uint64_t	vdc_matchinst;
416
417#define	DMSG(_vdc, err_level, format, ...)				\
418	do {								\
419		if (vdc_msglevel > err_level &&				\
420		(vdc_matchinst & (1ull << (_vdc)->instance)))		\
421			cmn_err(CE_CONT, "?[%d,t@%p] %s: "format,	\
422			(_vdc)->instance, (void *)curthread,		\
423			__func__, __VA_ARGS__);				\
424		_NOTE(CONSTANTCONDITION)				\
425	} while (0);
426
427#define	DMSGX(err_level, format, ...)					\
428	do {								\
429		if (vdc_msglevel > err_level)				\
430			cmn_err(CE_CONT, "?%s: "format, __func__, __VA_ARGS__);\
431		_NOTE(CONSTANTCONDITION)				\
432	} while (0);
433
434#define	VDC_DUMP_DRING_MSG(dmsgp)					\
435		DMSGX(0, "sq:%lu start:%d end:%d ident:%lu\n",		\
436			dmsgp->seq_num, dmsgp->start_idx,		\
437			dmsgp->end_idx, dmsgp->dring_ident);
438
439#else	/* !DEBUG */
440#define	DMSG(err_level, ...)
441#define	DMSGX(err_level, format, ...)
442#define	VDC_DUMP_DRING_MSG(dmsgp)
443
444#endif	/* !DEBUG */
445
446#ifdef	__cplusplus
447}
448#endif
449
450#endif	/* _VDC_H */
451