1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23269416Sdelphij * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24168404Spjd */
25168404Spjd
26168404Spjd#ifndef _SYS_VDEV_IMPL_H
27168404Spjd#define	_SYS_VDEV_IMPL_H
28168404Spjd
29168404Spjd#include <sys/avl.h>
30168404Spjd#include <sys/dmu.h>
31168404Spjd#include <sys/metaslab.h>
32168404Spjd#include <sys/nvpair.h>
33168404Spjd#include <sys/space_map.h>
34168404Spjd#include <sys/vdev.h>
35168404Spjd#include <sys/dkio.h>
36168404Spjd#include <sys/uberblock_impl.h>
37168404Spjd
38168404Spjd#ifdef	__cplusplus
39168404Spjdextern "C" {
40168404Spjd#endif
41168404Spjd
42168404Spjd/*
43168404Spjd * Virtual device descriptors.
44168404Spjd *
45168404Spjd * All storage pool operations go through the virtual device framework,
46168404Spjd * which provides data replication and I/O scheduling.
47168404Spjd */
48168404Spjd
49168404Spjd/*
50168404Spjd * Forward declarations that lots of things need.
51168404Spjd */
52168404Spjdtypedef struct vdev_queue vdev_queue_t;
53168404Spjdtypedef struct vdev_cache vdev_cache_t;
54168404Spjdtypedef struct vdev_cache_entry vdev_cache_entry_t;
55168404Spjd
56168404Spjd/*
57168404Spjd * Virtual device operations
58168404Spjd */
59236155Smmtypedef int	vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
60254591Sgibbs    uint64_t *logical_ashift, uint64_t *physical_ashift);
61168404Spjdtypedef void	vdev_close_func_t(vdev_t *vd);
62168404Spjdtypedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
63185029Spjdtypedef int	vdev_io_start_func_t(zio_t *zio);
64168404Spjdtypedef void	vdev_io_done_func_t(zio_t *zio);
65168404Spjdtypedef void	vdev_state_change_func_t(vdev_t *vd, int, int);
66219089Spjdtypedef void	vdev_hold_func_t(vdev_t *vd);
67219089Spjdtypedef void	vdev_rele_func_t(vdev_t *vd);
68168404Spjd
/*
 * Per-type operations vector.  Holds one function pointer for each of the
 * vdev_*_func_t entry-point types declared above, followed by the type's
 * name and a flag saying whether the type is a leaf.
 *
 * NOTE(review): vdev_op_type is a 16-byte array; presumably it holds a
 * NUL-terminated string, which would limit names to 15 characters --
 * confirm against the ops definitions.
 */
69168404Spjdtypedef struct vdev_ops {
70168404Spjd	vdev_open_func_t		*vdev_op_open;
71168404Spjd	vdev_close_func_t		*vdev_op_close;
72168404Spjd	vdev_asize_func_t		*vdev_op_asize;
73168404Spjd	vdev_io_start_func_t		*vdev_op_io_start;
74168404Spjd	vdev_io_done_func_t		*vdev_op_io_done;
75168404Spjd	vdev_state_change_func_t	*vdev_op_state_change;
76219089Spjd	vdev_hold_func_t		*vdev_op_hold;
77219089Spjd	vdev_rele_func_t		*vdev_op_rele;
78168404Spjd	char				vdev_op_type[16];
79168404Spjd	boolean_t			vdev_op_leaf;
80168404Spjd} vdev_ops_t;
81168404Spjd
82168404Spjd/*
83168404Spjd * Virtual device properties
84168404Spjd */
/*
 * One entry of a vdev_cache.  Each entry carries two AVL linkage nodes whose
 * names mirror the two trees in struct vdev_cache (offset and lastused).
 *
 * NOTE(review): presumably an entry is linked into both trees at once,
 * keyed by ve_offset and ve_lastused respectively -- confirm against the
 * cache implementation.
 */
85168404Spjdstruct vdev_cache_entry {
86168404Spjd	char		*ve_data;
87168404Spjd	uint64_t	ve_offset;
88168404Spjd	uint64_t	ve_lastused;
89168404Spjd	avl_node_t	ve_offset_node;
90168404Spjd	avl_node_t	ve_lastused_node;
91168404Spjd	uint32_t	ve_hits;
92168404Spjd	uint16_t	ve_missed_update;
93168404Spjd	zio_t		*ve_fill_io;
94168404Spjd};
95168404Spjd
/*
 * Per-vdev cache state (embedded in struct vdev as vdev_cache): two AVL
 * trees plus a mutex.
 *
 * NOTE(review): vc_lock presumably protects both trees -- confirm.
 */
96168404Spjdstruct vdev_cache {
97168404Spjd	avl_tree_t	vc_offset_tree;
98168404Spjd	avl_tree_t	vc_lastused_tree;
99168404Spjd	kmutex_t	vc_lock;
100168404Spjd};
101168404Spjd
/*
 * State kept per queue class; struct vdev_queue holds an array of these,
 * one per queueable zio priority.
 *
 * NOTE(review): vqc_active is presumably the number of currently active
 * I/Os in this class -- confirm against the queue implementation.
 */
102260763Savgtypedef struct vdev_queue_class {
103260763Savg	uint32_t	vqc_active;
104260763Savg
105260763Savg	/*
106260763Savg	 * Sorted by offset or timestamp, depending on if the queue is
107260763Savg	 * LBA-ordered vs FIFO.
108260763Savg	 */
109260763Savg	avl_tree_t	vqc_queued_tree;
110260763Savg} vdev_queue_class_t;
111260763Savg
/*
 * Per-vdev I/O queue (embedded in struct vdev as vdev_queue).  Holds a back
 * pointer to the owning vdev, one vdev_queue_class_t per queueable zio
 * priority (ZIO_PRIORITY_NUM_QUEUEABLE entries), the vq_active_tree AVL
 * tree, and the mutex vq_lock.
 *
 * NOTE(review): vq_last_offset and vq_lastoffset are two distinct members
 * with near-identical names.  They are easy to confuse; confirm which
 * consumer uses which before touching either.
 */
112168404Spjdstruct vdev_queue {
113260763Savg	vdev_t		*vq_vdev;
114260763Savg	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
115260763Savg	avl_tree_t	vq_active_tree;
116260763Savg	uint64_t	vq_last_offset;
117260763Savg	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
118168404Spjd	kmutex_t	vq_lock;
119271238Ssmh	uint64_t	vq_lastoffset;
120168404Spjd};
121168404Spjd
122168404Spjd/*
123168404Spjd * Virtual device descriptor
124168404Spjd */
/*
 * Members are grouped into three sections -- common to all vdev types,
 * top-level vdev state, and leaf vdev state -- followed by the kmutex_t
 * members, which must remain last (see the DTrace note near the end).
 */
125168404Spjdstruct vdev {
126168404Spjd	/*
127168404Spjd	 * Common to all vdev types.
128168404Spjd	 */
129168404Spjd	uint64_t	vdev_id;	/* child number in vdev parent	*/
130168404Spjd	uint64_t	vdev_guid;	/* unique ID for this vdev	*/
131168404Spjd	uint64_t	vdev_guid_sum;	/* self guid + all child guids	*/
132219089Spjd	uint64_t	vdev_orig_guid;	/* orig. guid prior to remove	*/
133168404Spjd	uint64_t	vdev_asize;	/* allocatable device capacity	*/
134219089Spjd	uint64_t	vdev_min_asize;	/* min acceptable asize		*/
135236155Smm	uint64_t	vdev_max_asize;	/* max acceptable asize		*/
136168404Spjd	uint64_t	vdev_ashift;	/* block alignment shift	*/
137254591Sgibbs	/*
138254591Sgibbs	 * Logical block alignment shift
139254591Sgibbs	 *
140254591Sgibbs	 * The smallest sized/aligned I/O supported by the device.
141254591Sgibbs	 */
142254591Sgibbs	uint64_t        vdev_logical_ashift;
143254591Sgibbs	/*
144254591Sgibbs	 * Physical block alignment shift
145254591Sgibbs	 *
146254591Sgibbs	 * The device supports logical I/Os with vdev_logical_ashift
147254591Sgibbs	 * size/alignment, but optimum performance will be achieved by
148254591Sgibbs	 * aligning/sizing requests to vdev_physical_ashift.  Smaller
149254591Sgibbs	 * requests may be inflated or incur device level read-modify-write
150254591Sgibbs	 * operations.
151254591Sgibbs	 *
152254591Sgibbs	 * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
153254591Sgibbs	 */
154254591Sgibbs	uint64_t        vdev_physical_ashift;
155168404Spjd	uint64_t	vdev_state;	/* see VDEV_STATE_* #defines	*/
156168404Spjd	uint64_t	vdev_prevstate;	/* used when reopening a vdev	*/
157168404Spjd	vdev_ops_t	*vdev_ops;	/* vdev operations		*/
158168404Spjd	spa_t		*vdev_spa;	/* spa for this vdev		*/
159168404Spjd	void		*vdev_tsd;	/* type-specific data		*/
160219089Spjd	vnode_t		*vdev_name_vp;	/* vnode for pathname		*/
161219089Spjd	vnode_t		*vdev_devid_vp;	/* vnode for devid		*/
162168404Spjd	vdev_t		*vdev_top;	/* top-level vdev		*/
163168404Spjd	vdev_t		*vdev_parent;	/* parent vdev			*/
164168404Spjd	vdev_t		**vdev_child;	/* array of children		*/
165168404Spjd	uint64_t	vdev_children;	/* number of children		*/
166168404Spjd	vdev_stat_t	vdev_stat;	/* virtual device statistics	*/
167219089Spjd	boolean_t	vdev_expanding;	/* expand the vdev?		*/
168219089Spjd	boolean_t	vdev_reopening;	/* reopen in progress?		*/
169219089Spjd	int		vdev_open_error; /* error on last open		*/
170219089Spjd	kthread_t	*vdev_open_thread; /* thread opening children	*/
171219089Spjd	uint64_t	vdev_crtxg;	/* txg when top-level was added */
172168404Spjd
173168404Spjd	/*
174168404Spjd	 * Top-level vdev state.
175168404Spjd	 */
176168404Spjd	uint64_t	vdev_ms_array;	/* metaslab array object	*/
177168404Spjd	uint64_t	vdev_ms_shift;	/* metaslab size shift		*/
178168404Spjd	uint64_t	vdev_ms_count;	/* number of metaslabs		*/
179168404Spjd	metaslab_group_t *vdev_mg;	/* metaslab group		*/
180168404Spjd	metaslab_t	**vdev_ms;	/* metaslab array		*/
181168404Spjd	txg_list_t	vdev_ms_list;	/* per-txg dirty metaslab lists	*/
182168404Spjd	txg_list_t	vdev_dtl_list;	/* per-txg dirty DTL lists	*/
183168404Spjd	txg_node_t	vdev_txg_node;	/* per-txg dirty vdev linkage	*/
184185029Spjd	boolean_t	vdev_remove_wanted; /* async remove wanted?	*/
185185029Spjd	boolean_t	vdev_probe_wanted; /* async probe wanted?	*/
186185029Spjd	list_node_t	vdev_config_dirty_node; /* config dirty list	*/
187185029Spjd	list_node_t	vdev_state_dirty_node; /* state dirty list	*/
188168404Spjd	uint64_t	vdev_deflate_ratio; /* deflation ratio (x512)	*/
189185029Spjd	uint64_t	vdev_islog;	/* is an intent log device	*/
190262093Savg	uint64_t	vdev_removing;	/* device is being removed?	*/
191262093Savg	boolean_t	vdev_ishole;	/* is a hole in the namespace	*/
192168404Spjd
193168404Spjd	/*
194168404Spjd	 * Leaf vdev state.
195168404Spjd	 */
196262093Savg	range_tree_t	*vdev_dtl[DTL_TYPES]; /* dirty time logs	*/
197262093Savg	space_map_t	*vdev_dtl_sm;	/* dirty time log space map	*/
198262093Savg	txg_node_t	vdev_dtl_node;	/* per-txg dirty DTL linkage	*/
199262093Savg	uint64_t	vdev_dtl_object; /* DTL object			*/
200168404Spjd	uint64_t	vdev_psize;	/* physical device capacity	*/
201168404Spjd	uint64_t	vdev_wholedisk;	/* true if this is a whole disk */
202185029Spjd	uint64_t	vdev_offline;	/* persistent offline state	*/
203185029Spjd	uint64_t	vdev_faulted;	/* persistent faulted state	*/
204185029Spjd	uint64_t	vdev_degraded;	/* persistent degraded state	*/
205185029Spjd	uint64_t	vdev_removed;	/* persistent removed state	*/
206254112Sdelphij	uint64_t	vdev_resilver_txg; /* persistent resilvering state */
207168404Spjd	uint64_t	vdev_nparity;	/* number of parity devices for raidz */
208168404Spjd	char		*vdev_path;	/* vdev path (if any)		*/
209168404Spjd	char		*vdev_devid;	/* vdev devid (if any)		*/
210185029Spjd	char		*vdev_physpath;	/* vdev device path (if any)	*/
211209962Smm	char		*vdev_fru;	/* physical FRU location	*/
212185029Spjd	uint64_t	vdev_not_present; /* not present during import	*/
213185029Spjd	uint64_t	vdev_unspare;	/* unspare when resilvering done */
214185029Spjd	boolean_t	vdev_nowritecache; /* true if flushwritecache failed */
215240868Spjd	boolean_t	vdev_notrim;	/* true if trim failed */
216185029Spjd	boolean_t	vdev_checkremove; /* temporary online test	*/
217185029Spjd	boolean_t	vdev_forcefault; /* force online fault		*/
218219089Spjd	boolean_t	vdev_splitting;	/* split or repair in progress  */
219219089Spjd	boolean_t	vdev_delayed_close; /* delayed device close?	*/
220262093Savg	boolean_t	vdev_tmpoffline; /* device taken offline temporarily? */
221262093Savg	boolean_t	vdev_detached;	/* device detached?		*/
222262093Savg	boolean_t	vdev_cant_read;	/* vdev is failing all reads	*/
223262093Savg	boolean_t	vdev_cant_write; /* vdev is failing all writes	*/
224262093Savg	boolean_t	vdev_isspare;	/* was a hot spare		*/
225262093Savg	boolean_t	vdev_isl2cache;	/* was a l2cache device		*/
226168404Spjd	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
227168404Spjd	vdev_cache_t	vdev_cache;	/* physical block cache		*/
228185029Spjd	spa_aux_vdev_t	*vdev_aux;	/* for l2cache vdevs		*/
229185029Spjd	zio_t		*vdev_probe_zio; /* root of current probe	*/
230219089Spjd	vdev_aux_t	vdev_label_aux;	/* on-disk aux state		*/
231271238Ssmh	struct trim_map	*vdev_trimmap;	/* map on outstanding trims	*/
232271238Ssmh	uint16_t	vdev_rotation_rate; /* rotational rate of the media */
	/*
	 * NOTE(review): the two VDEV_RATE_* values below are presumably
	 * special values of vdev_rotation_rate -- confirm against the code
	 * that sets and reads the field.
	 */
233271238Ssmh#define	VDEV_RATE_UNKNOWN	0
234271238Ssmh#define	VDEV_RATE_NON_ROTATING	1
235168404Spjd
236168404Spjd	/*
237168404Spjd	 * For DTrace to work in userland (libzpool) context, these fields must
238168404Spjd	 * remain at the end of the structure.  DTrace will use the kernel's
239168404Spjd	 * CTF definition for 'struct vdev', and since the size of a kmutex_t is
240236884Smm	 * larger in userland, the offsets for the rest of the fields would be
241168404Spjd	 * incorrect.
242168404Spjd	 */
	/*
	 * NOTE(review): the trailing comment on vdev_dtl_lock names
	 * vdev_dtl_{map,resilver}, which are not members of this struct as
	 * declared here; the DTL members are vdev_dtl[], vdev_dtl_sm,
	 * vdev_dtl_node and vdev_dtl_object.  Confirm what the lock covers.
	 */
243168404Spjd	kmutex_t	vdev_dtl_lock;	/* vdev_dtl_{map,resilver}	*/
244168404Spjd	kmutex_t	vdev_stat_lock;	/* vdev_stat			*/
245185029Spjd	kmutex_t	vdev_probe_lock; /* protects vdev_probe_zio	*/
246168404Spjd};
247168404Spjd
248219089Spjd#define	VDEV_RAIDZ_MAXPARITY	3
249219089Spjd
/* Size in bytes of each padding area (vl_pad1, vl_pad2) in a vdev label. */
#define	VDEV_PAD_SIZE		(8 << 10)
/*
 * 2 padding areas (vl_pad1 and vl_pad2) to skip.
 *
 * The replacement list is parenthesized so the macro expands as a single
 * operand; without the parentheses an expression such as
 * "x / VDEV_SKIP_SIZE" would parse as "(x / VDEV_PAD_SIZE) * 2".
 */
#define	VDEV_SKIP_SIZE		(VDEV_PAD_SIZE * 2)
253168404Spjd#define	VDEV_PHYS_SIZE		(112 << 10)
254168404Spjd#define	VDEV_UBERBLOCK_RING	(128 << 10)
255168404Spjd
256269416Sdelphij/* The largest uberblock we support is 8k. */
257269416Sdelphij#define	MAX_UBERBLOCK_SHIFT (13)
/*
 * log2 of the uberblock slot size for vd: the ashift of vd's top-level vdev,
 * raised to at least UBERBLOCK_SHIFT and capped at MAX_UBERBLOCK_SHIFT.
 * Note that vd is dereferenced (vd->vdev_top), so it must be a valid vdev
 * with vdev_top set.
 */
258168404Spjd#define	VDEV_UBERBLOCK_SHIFT(vd)	\
259269416Sdelphij	MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
260269416Sdelphij	    MAX_UBERBLOCK_SHIFT)
/* Number of uberblock slots of that size that fit in VDEV_UBERBLOCK_RING. */
261168404Spjd#define	VDEV_UBERBLOCK_COUNT(vd)	\
262168404Spjd	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
/* Byte offset, within a vdev_label_t, of uberblock slot n. */
263168404Spjd#define	VDEV_UBERBLOCK_OFFSET(vd, n)	\
264168404Spjd	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
/* Size in bytes of one uberblock slot. */
265168404Spjd#define	VDEV_UBERBLOCK_SIZE(vd)		(1ULL << VDEV_UBERBLOCK_SHIFT(vd))
266168404Spjd
/*
 * Layout of the vl_vdev_phys region of a vdev label: a character buffer
 * (vp_nvlist) followed by a zio_eck_t.  The buffer is sized as
 * VDEV_PHYS_SIZE minus sizeof (zio_eck_t), so the two members together
 * occupy VDEV_PHYS_SIZE bytes provided the compiler inserts no padding.
 */
267168404Spjdtypedef struct vdev_phys {
268219089Spjd	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
269219089Spjd	zio_eck_t	vp_zbt;
270168404Spjd} vdev_phys_t;
271168404Spjd
/*
 * Layout of one vdev label: two pad areas of VDEV_PAD_SIZE bytes each, the
 * vdev_phys_t region, and the uberblock ring of VDEV_UBERBLOCK_RING bytes.
 * Per the size annotations on the members these add up to 256K.
 */
272168404Spjdtypedef struct vdev_label {
273209962Smm	char		vl_pad1[VDEV_PAD_SIZE];			/*  8K */
274209962Smm	char		vl_pad2[VDEV_PAD_SIZE];			/*  8K */
275168404Spjd	vdev_phys_t	vl_vdev_phys;				/* 112K	*/
276168404Spjd	char		vl_uberblock[VDEV_UBERBLOCK_RING];	/* 128K	*/
277168404Spjd} vdev_label_t;							/* 256K total */
278168404Spjd
279168404Spjd/*
280168404Spjd * vdev_dirty() flags
281168404Spjd */
282168404Spjd#define	VDD_METASLAB	0x01
283168404Spjd#define	VDD_DTL		0x02
284168404Spjd
285251631Sdelphij/* Offset of embedded boot loader region on each label */
286251631Sdelphij#define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
287168404Spjd/*
288251631Sdelphij * Size of embedded boot loader region on each label.
289168404Spjd * The total size of the first two labels plus the boot area is 4MB.
290168404Spjd */
291251631Sdelphij#define	VDEV_BOOT_SIZE		(7ULL << 19)			/* 3.5M */
292168404Spjd
293168404Spjd/*
294168404Spjd * Size of label regions at the start and end of each leaf device.
295168404Spjd */
296168404Spjd#define	VDEV_LABEL_START_SIZE	(2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
297168404Spjd#define	VDEV_LABEL_END_SIZE	(2 * sizeof (vdev_label_t))
298168404Spjd#define	VDEV_LABELS		4
299236884Smm#define	VDEV_BEST_LABEL		VDEV_LABELS
300168404Spjd
301168404Spjd#define	VDEV_ALLOC_LOAD		0
302168404Spjd#define	VDEV_ALLOC_ADD		1
303168404Spjd#define	VDEV_ALLOC_SPARE	2
304185029Spjd#define	VDEV_ALLOC_L2CACHE	3
305219089Spjd#define	VDEV_ALLOC_ROOTPOOL	4
306219089Spjd#define	VDEV_ALLOC_SPLIT	5
307230514Smm#define	VDEV_ALLOC_ATTACH	6
308168404Spjd
309168404Spjd/*
310168404Spjd * Allocate or free a vdev
311168404Spjd */
312219089Spjdextern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
313219089Spjd    vdev_ops_t *ops);
314168404Spjdextern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
315168404Spjd    vdev_t *parent, uint_t id, int alloctype);
316168404Spjdextern void vdev_free(vdev_t *vd);
317168404Spjd
318168404Spjd/*
319168404Spjd * Add or remove children and parents
320168404Spjd */
321168404Spjdextern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
322168404Spjdextern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
323168404Spjdextern void vdev_compact_children(vdev_t *pvd);
324168404Spjdextern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
325168404Spjdextern void vdev_remove_parent(vdev_t *cvd);
326168404Spjd
327168404Spjd/*
328168404Spjd * vdev load and sync
329168404Spjd */
330219089Spjdextern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
331219089Spjdextern boolean_t vdev_log_state_valid(vdev_t *vd);
332168404Spjdextern void vdev_load(vdev_t *vd);
333262093Savgextern int vdev_dtl_load(vdev_t *vd);
334168404Spjdextern void vdev_sync(vdev_t *vd, uint64_t txg);
335168404Spjdextern void vdev_sync_done(vdev_t *vd, uint64_t txg);
336168404Spjdextern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
337262093Savgextern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
338168404Spjd
339168404Spjd/*
340168404Spjd * Available vdev types.
341168404Spjd */
342168404Spjdextern vdev_ops_t vdev_root_ops;
343168404Spjdextern vdev_ops_t vdev_mirror_ops;
344168404Spjdextern vdev_ops_t vdev_replacing_ops;
345168404Spjdextern vdev_ops_t vdev_raidz_ops;
346168404Spjd#ifdef _KERNEL
347168404Spjdextern vdev_ops_t vdev_geom_ops;
348168404Spjd#else
349168404Spjdextern vdev_ops_t vdev_disk_ops;
350185029Spjd#endif
351168404Spjdextern vdev_ops_t vdev_file_ops;
352168404Spjdextern vdev_ops_t vdev_missing_ops;
353219089Spjdextern vdev_ops_t vdev_hole_ops;
354168404Spjdextern vdev_ops_t vdev_spare_ops;
355168404Spjd
356168404Spjd/*
357168404Spjd * Common size functions
358168404Spjd */
359168404Spjdextern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
360219089Spjdextern uint64_t vdev_get_min_asize(vdev_t *vd);
361219089Spjdextern void vdev_set_min_asize(vdev_t *vd);
362168404Spjd
363168404Spjd/*
364251631Sdelphij * Global variables
365168404Spjd */
366251631Sdelphij/* zdb uses this tunable, so it must be declared here to make lint happy. */
367168404Spjdextern int zfs_vdev_cache_size;
368168404Spjd
369263393Sdelphij#ifdef illumos
370263393Sdelphij/*
371263393Sdelphij * The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
372263393Sdelphij */
373263393Sdelphijtypedef struct vdev_buf {
374263393Sdelphij	buf_t	vb_buf;		/* buffer that describes the io */
375263393Sdelphij	zio_t	*vb_io;		/* pointer back to the original zio_t */
376263393Sdelphij} vdev_buf_t;
377263393Sdelphij#endif
378263393Sdelphij
379168404Spjd#ifdef	__cplusplus
380168404Spjd}
381168404Spjd#endif
382168404Spjd
383168404Spjd#endif	/* _SYS_VDEV_IMPL_H */
384