1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#ifndef _SYS_MD_MDDB_H
27#define	_SYS_MD_MDDB_H
28
29#pragma ident	"%Z%%M%	%I%	%E% SMI"
30
31#include <sys/types.h>
32#include <sys/buf.h>
33
34#ifdef	__cplusplus
35extern "C" {
36#endif
37
/*
 * Debugging switch, normally compiled out.  When MDDB_FAKE is defined the
 * "#ifndef MDDB_FAKE" section later in this header is skipped and the
 * stub macros in its #else branch are used instead.
 */
#if 0 /* DRP FOR DEBUGGING */
#define	MDDB_FAKE
#endif
41
/*
 * Private flags.  Each MD_PRV_PEND* value is MD_PRV_GOTIT combined with
 * exactly one pending-operation bit.
 */
#define	MD_PRV_GOTIT		0x0001	/* Been snarfed */
#define	MD_PRV_DELETE		0x0002	/* Record pending to be deleted */
#define	MD_PRV_COMMIT		0x0004	/* Record pending to be committed */
#define	MD_PRV_CLEANUP		0x0008	/* Record pending to be cleaned up */
#define	MD_PRV_CONVD		0x0010  /* Record has been converted (32->64) */
#define	MD_PRV_PENDDEL		(MD_PRV_GOTIT | MD_PRV_DELETE)
#define	MD_PRV_PENDCOM		(MD_PRV_GOTIT | MD_PRV_COMMIT)
#define	MD_PRV_PENDCLEAN	(MD_PRV_GOTIT | MD_PRV_CLEANUP)
51
52
/* MDDB_E_* error codes; all values are negative and distinct. */
#define	MDDB_E_INVALID	(-1)	/* an invalid argument was passed */
#define	MDDB_E_EXISTS	(-2)	/* doing an operation a 2nd time which can */
				/*	only be done once */
#define	MDDB_E_MASTER	(-3)	/* problem occurred accessing master block */
				/*	returned from NEW_DEV	*/
#define	MDDB_E_TOOSMALL	(-4)	/* device is not large enough */
#define	MDDB_E_NORECORD	(-5)	/* record does not exist */
				/*
				 *	returned from:	mddb_getnextrec
				 *			mddb_getrecsize
				 *			mddb_commitrec
				 *			mddb_commitrecs
				 *			mddb_deleterec
				 */
#define	MDDB_E_NOSPACE	(-6)	/* no space to create record */
#define	MDDB_E_NOTNOW	(-7)	/* do not presently have enough resources */
				/*	to perform requested operation */
#define	MDDB_E_NODB	(-8)	/* no database exists */
#define	MDDB_E_NOTOWNER (-9)	/* have not been told to grab this set */
#define	MDDB_E_STALE	(-10)	/* database is stale */
#define	MDDB_E_TOOFEW	(-11)	/* not enough replicas available */
#define	MDDB_E_TAGDATA	(-12)	/* tagged data detected */
#define	MDDB_E_ACCOK	(-13)	/* 50/50 mode */
#define	MDDB_E_NTAGDATA	(-14)	/* tagop try, no tag data */
#define	MDDB_E_ACCNOTOK	(-15)	/* accop try, no accept possible */
#define	MDDB_E_NOLOCBLK	(-16)	/* No valid locators found */
#define	MDDB_E_NOLOCNMS	(-17)	/* No valid locator name information */
#define	MDDB_E_NODIRBLK	(-18)	/* No directory blocks found */
#define	MDDB_E_NOTAGREC	(-19)	/* No tag record blocks found */
#define	MDDB_E_NOTAG	(-20)	/* No matching tag record found */
#define	MDDB_E_NODEVID	(-21)	/* No device id found */
84
/*
 * Replica size limits, expressed as block counts.  The MB figures quoted
 * for the multinode (MN) limits correspond to 512-byte blocks.
 */
#define	MDDB_MINBLKS		16	/* enough for a few metadevices */
#define	MDDB_MAXBLKS		8192	/* size of free bit map (must be / 8) */
#define	MDDB_MN_MINBLKS		32768	/* Multinode metadb minimum size */
					/* 16MB */
#define	MDDB_MN_MAXBLKS		524288	/* size of free bit map (must be / 8) */
					/* 256MB */

/* MDDB_C_* values are distinct single bits and may be OR'ed together. */
#define	MDDB_C_STALE		0x0001
#define	MDDB_C_TOOFEW		0x0002
#define	MDDB_C_NOTOWNER		0x0004
#define	MDDB_C_SET_MN_STALE	0x0008	/* Set MN set to stale */
#define	MDDB_C_IMPORT		0x0010
97
/*
 * Defines used to set/reset the new master flag in the set structure.
 * Used during a reconfig cycle to determine quickly whether there is a
 * new master for the set.
 */
#define	MDDB_NM_SET		0x0001
#define	MDDB_NM_RESET		0x0002
#define	MDDB_NM_GET		0x0004

/* Definitions of flag in Locator Block Device ID data area - mddb_did_info */
#define	MDDB_DID_EXISTS		0x0001	/* Device ID exists */
#define	MDDB_DID_VALID		0x0002	/* Device ID valid on current system */
#define	MDDB_DID_UPDATED	0x0004  /* locator/sidelocator info updated */

/* Definitions of flag in Locator Block - mddb_lb */
#define	MDDB_DEVID_STYLE	0x0001	/* Locator Block in Device ID format */
#define	MDDB_MNSET		0x0002  /* MDDB is for a multi-node set */


#define	MDDB_MAX_PATCH	25		/* number of locations that */
					/*	can be patched in etc/system */
119
/*
 * Set struct used by all parts of the driver, to store anchor pointers.
 *
 * Locks associated with the fields in this structure:
 *
 * Some of the fields are accessible by both the single threaded ioctl
 * thread and internal threads such as resync, hotsparing, etc.  In this
 * case additional protection is needed.  For example, s_db is additionally
 * protected by s_dbmx, and s_un, s_ui are protected by
 * md_unit_array_rw.lock.  s_nm, s_nmid, s_did_nm, s_did_nmid and s_dtp are
 * protected by nm_lock.  The rest of the fields are protected by md_mx.
 * Two fields, s_un_next and s_un_avail, were introduced by the friendly
 * name project and are ONLY accessible via the single threaded ioctl
 * thread, which is already protected by the ioctl lock, so there is no
 * need to add extra protection to them.  However, if in the future they
 * become accessible by other internal threads, then additional protection
 * such as the md_mx lock is highly recommended.
 *
 */
typedef struct md_set {
	uint_t		s_status;	/* set status */
	void		**s_ui;		/* set unit incore anchor */
	void		**s_un;		/* set unit anchor */
	void		*s_hsp;		/* set Hot Spare Pool anchor */
	void		*s_hs;		/* set Hot Spare anchor */
	void		*s_db;		/* set MDDB anchor */
	kmutex_t	s_dbmx;		/* set MDDB mutex */
	void		*s_nm;		/* set namespace anchor */
	mddb_recid_t	s_nmid;		/* set namespace anchor record */
	void		*s_did_nm;	/* set device id namespace anchor */
	mddb_recid_t	s_did_nmid;	/* set device id namespace anchor rec */
	void		*s_dtp;		/* set data tag rec */
	int		s_am_i_master;	/* incore master flag for this node */
	md_mn_nodeid_t	s_nodeid;	/* nodeid of this node - for MN sets */
	uint_t		s_rcnt;		/* incore resync count for set */
	unit_t		s_un_next;	/* s_un scan starts here */
	unit_t		s_un_avail;	/* number of avail slots */
} md_set_t;
157
158
/*
 * Block magic numbers.  Each value is four ASCII bytes: "md" followed by
 * a two-letter block tag (e.g. 0x6d646d62 is "mdmb").
 */
#define	MDDB_MAGIC_MB	0x6d646d62	/* magic number for master blocks */
#define	MDDB_MAGIC_DB	0x6d646462	/* magic number for directory blocks */
#define	MDDB_MAGIC_RB	0x6d647262	/* magic number for record blocks */
#define	MDDB_MAGIC_LB	0x6d646c62	/* magic number for locator blocks */
#define	MDDB_MAGIC_LN	0x6d646c6e	/* magic number for locator names */
#define	MDDB_MAGIC_DT	0x6d646474	/* magic number for data tag */
#define	MDDB_MAGIC_DI	0x6d646469	/* magic number for device ID block */
#define	MDDB_MAGIC_DU	0x6d646475	/* magic num for dummy mb */
#define	MDDB_MAGIC_DE	0x6d646465	/* magic num for mb devid */

#define	MDDB_GLOBAL_XOR 1234567890

/* Masks selecting the major (high byte) and minor (low byte) revision. */
#define	MDDB_REV_MAJOR  (uint_t)0xff00
#define	MDDB_REV_MINOR  (uint_t)0x00ff
173
174/*
175 * MDDB_REV_MNMB:
176 * If a MN diskset, master block revision is set to MDDB_REV_MNMB.
177 * Even though the master block structure is no different
178 * for a MN set, setting the revision field to a different
179 * number keeps any pre-MN_diskset code from accessing
180 * this diskset.  It also allows for an early determination
181 * of a MN diskset when reading in from disk so that the
182 * proper size locator block and locator names structure
183 * can be read in thus saving time on diskset startup.
184 * Since no change in master block structure, the MDDB_REV_MINOR
185 * portion of the revision was incremented.
186 *
187 * MDDB_REV_MNLB:
188 * If a MN diskset, the locator block structure is a different size in
 * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
190 * with any nodeid (sideno) allowed.
191 * The revision is set to MDDB_REV_MNLB which is a change of the
192 * MDDB_REV_MAJOR portion of the revision.
193 *
194 * MDDB_REV_MNLN:
195 * If a MN diskset, the locator names is a different size in
 * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
197 * with any nodeid (sideno) allowed.
198 * The revision is set to MDDB_REV_MNLN which is a change of the
199 * MDDB_REV_MAJOR portion of the revision.
200 *
201 * The record blocks have two binary properties.  A record block can
202 * represent either a 32 or 64 bit unit.  A record block can also represent
203 * a traditionally named unit or a friendly named unit.  Thus, there are
204 * minor revisions of record block.
205 *
206 *		Traditional		Friendly
207 *		Name			Name
208 *		-----------		--------
209 * 32 bit	MDDB_REV_RB		MDDB_REV_RBFN
210 * 64 bit	MDDB_REV_RB64		MDDB_REV_RB64FN
211 */
212
/*
 * Block revisions: major number in the high byte, minor in the low byte
 * (see MDDB_REV_MAJOR and MDDB_REV_MINOR).  In the four record block (RB)
 * revisions, minor bit 0 is set for the 64 bit forms and minor bit 1 is
 * set for the friendly name forms.
 */
#define	MDDB_REV_MB	(uint_t)0x0201
#define	MDDB_REV_MNMB	(uint_t)0x0202
#define	MDDB_REV_DB	(uint_t)0x0201
#define	MDDB_REV_LB	(uint_t)0x0500
#define	MDDB_REV_MNLB	(uint_t)0x0600
#define	MDDB_REV_LN	(uint_t)0x0100
#define	MDDB_REV_MNLN	(uint_t)0x0300
#define	MDDB_REV_RB	(uint_t)0x0200
#define	MDDB_REV_RB64	(uint_t)0x0201
#define	MDDB_REV_RBFN	(uint_t)0x0202
#define	MDDB_REV_RB64FN	(uint_t)0x0203
#define	MDDB_REV_DT	(uint_t)0x0100
#define	MDDB_REV_DI	(uint_t)0x0100
226
/*
 * Transfer record block friendly name status to unit/hs structure.
 *
 * rbv is a record block revision; unv is a modifiable lvalue whose
 * MD_FN_META_DEV bit is cleared for MDDB_REV_RB / MDDB_REV_RB64 and set
 * for MDDB_REV_RBFN / MDDB_REV_RB64FN.  Any other revision leaves unv
 * untouched.
 *
 * The expansion is a bare switch statement, so it cannot be used as the
 * body of an "if" that is followed by "else".
 */
#define	MDDB_NOTE_FN(rbv, unv)	switch (rbv) { \
				case MDDB_REV_RB: \
				case MDDB_REV_RB64: \
					(unv) &= ~MD_FN_META_DEV; \
					break; \
				case MDDB_REV_RBFN: \
				case MDDB_REV_RB64FN: \
					(unv) |= MD_FN_META_DEV; \
					break;	\
				default: \
					break; \
				}
240
#define	MDDB_BSIZE	(uint_t)DEV_BSIZE	/* database block size, bytes */
#define	MDDB_PREFIXCNT	10	/* entries in ln_prefixes[] */
#define	MDDB_DRVNMCNT   10	/* entries in lb_drvnm[] */

/* Block number or block count within the database. */
typedef int	mddb_block_t;
246
247#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
248#pragma pack(4)
249#endif
/*
 * Name suffix tagged with the side number it belongs to; element type of
 * mddb_mnln_t's ln_mnsuffixes array.
 */
typedef struct md_mnname_suffix {
	md_name_suffix	mn_ln_suffix;
	uint_t		mn_ln_sideno;
} md_mnname_suffix_t;
254
/*
 * Locator names block.  The first dimension of ln_suffixes has
 * MD_MAXSIDES entries; RNDUP_BLKCNT depends on the shape of that array.
 */
typedef	struct mddb_ln {
	int			ln_magic;	/* presumably MDDB_MAGIC_LN */
	uint_t			ln_revision;
	uint_t			ln_checksum;
	struct timeval32	ln_timestamp;
	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
	/* Don't change array sizes without changing RNDUP_BLKCNT */
	md_name_suffix		ln_suffixes[MD_MAXSIDES][MDDB_NLB];
} mddb_ln_t;
264
/*
 * Locator name structure for MN diskset.  Same as for traditional
 * and local diskset except that more sides are supported and the
 * side number can be any number since the side number is stored
 * in the ln_mnsuffixes structure instead of being used as an index
 * into that array.  This means that the whole array may need to be
 * searched in order to find the correct information given a side number.
 *
 * The leading fields (ln_magic through ln_prefixes) are declared in the
 * same order and with the same types as in mddb_ln_t.
 */
typedef	struct mddb_mnln {
	int			ln_magic;
	uint_t			ln_revision;
	uint_t			ln_checksum;
	struct timeval32	ln_timestamp;
	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
	/* Don't change array sizes without changing MDDB_MNLNCNT */
	md_mnname_suffix_t	ln_mnsuffixes[MD_MNMAXSIDES][MDDB_NLB];
} mddb_mnln_t;
282
/*
 * RNDUP_BLKCNT(sz, delta): number of MDDB_BSIZE blocks needed to hold sz
 * bytes after removing (MD_MAXSIDES - 1) * MDDB_NLB elements of delta
 * bytes each, rounded up.  With delta == 0 the whole structure is
 * counted; with delta == sizeof (one trailing array element) only a
 * single side's row of the [MD_MAXSIDES][MDDB_NLB] array is counted,
 * which is what the MDDB_LOCAL_* variants do.
 */
#define	RNDUP_BLKCNT(sz, delta)	(((sz) - \
				    ((delta) * \
				    ((MD_MAXSIDES  - 1) * MDDB_NLB)) + \
				    MDDB_BSIZE - 1) / MDDB_BSIZE)
#define	MDDB_LNCNT		RNDUP_BLKCNT(sizeof (mddb_ln_t), 0)
#define	MDDB_LOCAL_LNCNT	RNDUP_BLKCNT(sizeof (mddb_ln_t), \
				    sizeof (md_name_suffix))

/* Blocks needed for a full MN locator names structure, rounded up. */
#define	MDDB_MNLNCNT		((sizeof (mddb_mnln_t) + (MDDB_BSIZE - 1)) \
				    / MDDB_BSIZE)
293
/*
 * Data tag block: three header words followed by the data tag itself.
 * NOTE(review): dt_mag/dt_rev/dt_cks look like magic, revision and
 * checksum (cf. MDDB_MAGIC_DT, MDDB_REV_DT) - confirm against md_mddb.c.
 */
typedef struct mddb_dt {
	uint_t		dt_mag;
	uint_t		dt_rev;
	uint_t		dt_cks;
	mddb_dtag_t	dt_dtag;
} mddb_dt_t;

/* Size of mddb_dt_t rounded up to a whole number of MDDB_BSIZE blocks. */
#define	MDDB_DT_BYTES	(roundup(sizeof (mddb_dt_t), MDDB_BSIZE))
#define	MDDB_DT_BLOCKS	(btodb(MDDB_DT_BYTES))
303
/*
 * Identifier stored in the locator block (lb_ident) and in the incore set
 * (s_ident): either a serial string or a creation timestamp.
 */
typedef union identifier {
	char			serial[MDDB_SN_LEN];
	struct timeval32	createtime;
} identifier_t;
308
/* One entry of lb_locators[]: 32-bit device, block number and flags. */
typedef struct mddb_locator {
	dev32_t		l_dev;
	daddr32_t	l_blkno;
	int		l_flags;
} mddb_locator_t;
314
/*
 * One entry of lb_sidelocators[side][locator].
 * NOTE(review): l_drvnm_index presumably indexes lb_drvnm[] - confirm.
 */
typedef struct mddb_sidelocator {
	uchar_t		l_drvnm_index;
	minor_t		l_mnum;
} mddb_sidelocator_t;
319
/*
 * MN diskset form of mddb_sidelocator_t; carries its own side number
 * (mnl_sideno) in addition to the driver-name index and minor number.
 */
typedef struct mddb_mnsidelocator {
	uchar_t		mnl_drvnm_index;
	minor_t		mnl_mnum;
	uint_t		mnl_sideno;
} mddb_mnsidelocator_t;
325
/* One entry of lb_drvnm[]: a length byte plus up to MD_MAXDRVNM chars. */
typedef struct mddb_drvnm {
	uchar_t		dn_len;
	char		dn_data[MD_MAXDRVNM];
} mddb_drvnm_t;
330
/*
 * Locator Block Device ID Information
 * Several device ids may share one disk block in an effort to
 * conserve used replica space.
 *
 * info_flags takes the MDDB_DID_* bits defined earlier in this header.
 */
typedef struct mddb_did_info {
	uint_t		info_flags;	/* MDDB Device ID flags */
	uint_t		info_firstblk;	/* Device ID Start Block */
	uint_t		info_blkcnt;	/* Device ID Block Count */
	uint_t		info_offset;	/* Device ID offset w/i Block */
	uint_t		info_length;	/* Device ID Length */
	uint_t		info_checksum;	/* Device ID Checksum */
	char		info_minor_name[32]; /* Minor name of lb dev */
} mddb_did_info_t;
345
/*
 * Device ID block: verification header plus one mddb_did_info_t for each
 * of the MDDB_NLB locator slots.
 */
typedef struct mddb_did_blk {
	int		blk_magic;	/* used for verification */
	uint_t		blk_revision;	/* used for verification */
	int		blk_checksum;	/* used for verification */
	uint_t		blk_commitcnt;	/* matches LB's commitcnt */
	mddb_did_info_t	blk_info[MDDB_NLB];
} mddb_did_blk_t;
353#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
354#pragma pack()
355#endif
356
/* Size of mddb_did_blk_t rounded up to a whole number of MDDB_BSIZE blocks. */
#define	MDDB_DID_BYTES	(roundup(sizeof (mddb_did_blk_t), MDDB_BSIZE))
#define	MDDB_DID_BLOCKS	(btodb(MDDB_DID_BYTES))
359
/*
 * Device ID Disk Blocks.
 * Incore linked list of disk blocks containing device IDs.
 * The list is built when reading in the mddb_did_blk structure and
 * when reading in the actual disk blocks containing device ids.
 * This list is used to easily write out all disk blocks containing
 * device ids.
 *
 * The list is singly linked through db_next.
 */
typedef struct mddb_did_db {
	uint_t		db_firstblk;	/* Disk Block's logical addr */
	uint_t		db_blkcnt;	/* Contig Disk Block Count */
	caddr_t		db_ptr;		/* Ptr to incore Block(s) */
	struct mddb_did_db	*db_next;	/* Ptr to next in list */
} mddb_did_db_t;
374
/*
 * Device ID Free List.
 * Incore linked list of free space in disk blocks containing device IDs.
 * Used to manage placement of device IDs in disk blocks.
 * All disk blocks on the free list are also in the linked list of disk
 * blocks containing device IDs (mddb_did_db_t).
 *
 * The list is singly linked through free_next.
 */
typedef struct mddb_did_free {
	uint_t			free_blk;	/* Disk Block's logical addr */
	uint_t			free_offset;	/* offset of free space */
	uint_t			free_length;	/* length of free space */
	struct mddb_did_free	*free_next;	/* Ptr to next in list */
} mddb_did_free_t;
388
/*
 * Device ID Incore Area
 *    Contains pointer to Device ID Disk Block list and
 *         Device ID Free List.
 *    Also contains incore array of pointers to device IDs.  Pointers
 *    point into the device ID Disk Block list and are used as a
 *    shortcut to find incore device IDs.
 */
typedef struct mddb_did_ic {
	mddb_did_blk_t	*did_ic_blkp;	/* Device ID block */
	mddb_did_db_t	*did_ic_dbp;	/* Device ID Disk Block list */
	mddb_did_free_t	*did_ic_freep;	/* Device ID Free List */
	ddi_devid_t	did_ic_devid[MDDB_NLB]; /* Ptr to device IDs */
} mddb_did_ic_t;
403
/*
 * Locator Block (LB):
 *	- Are fixed size, but the size is different
 *		for local/shared set db replicas.
 *	- All LB's start at logical block 0.
 * 	- After a replica quorum is found, there is
 *	  only one incore copy of the LB.
 *	- LB's are only written when replicas are added, deleted, or errored.
 *	- LB's provide information about other replicas and their state.
 *
 * The structure is declared under #pragma pack(4) when 64-bit and 32-bit
 * long long alignment differ.  lb_flags takes MDDB_DEVID_STYLE and
 * MDDB_MNSET.
 */
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack(4)
#endif
typedef struct mddb_lb {
	int			lb_magic;	/* used for verification */
	uint_t			lb_revision;	/* used for verification */
	int			lb_checksum;	/* used for verification */
	uint_t			lb_commitcnt;	/* IMPORTANT */
	struct timeval32	lb_timestamp;	/* informative only */
	int			lb_loccnt;	/* used for verification */
	identifier_t		lb_ident;	/* used for verification */
	uint_t			lb_flags;	/* flags describing LB */
	uint_t			lb_spare[8];	/* Spare/Pad */
	mddb_block_t		lb_didfirstblk;	/* Devid Array Start Block */
	mddb_block_t		lb_didblkcnt;	/* Devid Array Number Blocks */
	mddb_block_t		lb_dtfirstblk;	/* Data Tag Start Block */
	mddb_block_t		lb_dtblkcnt;	/* Data Tag Number Block(s) */
	struct timeval32	lb_inittime;	/* creation of database */
	set_t			lb_setno;	/* used for verification */
	mddb_block_t		lb_blkcnt;	/* used for verification */
	mddb_block_t		lb_lnfirstblk;
	mddb_block_t		lb_lnblkcnt;
	mddb_block_t		lb_dbfirstblk;
	mddb_drvnm_t		lb_drvnm[MDDB_DRVNMCNT];
	mddb_locator_t		lb_locators[MDDB_NLB];
	/* Don't change array sizes without changing RNDUP_BLKCNT */
	mddb_sidelocator_t	lb_sidelocators[MD_MAXSIDES][MDDB_NLB];
} mddb_lb_t;
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack()
#endif
445
/*
 * Locator block structure for MN diskset.  Same as for traditional
 * and local diskset except that more sides are supported and the
 * side number can be any number since the side number is stored
 * in the lb_mnsidelocators structure instead of being used as an index
 * into that array.  This means that the whole array may need to be
 * searched in order to find the correct information given a side number.
 *
 * Every member up to and including lb_locators is declared exactly as in
 * mddb_lb_t.
 * NOTE(review): unlike mddb_lb_t, this definition sits outside the
 * #pragma pack(4) region - confirm that is intentional.
 */
typedef struct mddb_mnlb {
	int			lb_magic;	/* used for verification */
	uint_t			lb_revision;	/* used for verification */
	int			lb_checksum;	/* used for verification */
	uint_t			lb_commitcnt;	/* IMPORTANT */
	struct timeval32	lb_timestamp;	/* informative only */
	int			lb_loccnt;	/* used for verification */
	identifier_t		lb_ident;	/* used for verification */
	uint_t			lb_flags;	/* flags describing LB */
	uint_t			lb_spare[8];	/* Spare/Pad */
	mddb_block_t		lb_didfirstblk;	/* Devid Array Start Block */
	mddb_block_t		lb_didblkcnt;	/* Devid Array Number Blocks */
	mddb_block_t		lb_dtfirstblk;	/* Data Tag Start Block */
	mddb_block_t		lb_dtblkcnt;	/* Data Tag Number Block(s) */
	struct timeval32	lb_inittime;	/* creation of database */
	set_t			lb_setno;	/* used for verification */
	mddb_block_t		lb_blkcnt;	/* used for verification */
	mddb_block_t		lb_lnfirstblk;
	mddb_block_t		lb_lnblkcnt;
	mddb_block_t		lb_dbfirstblk;
	mddb_drvnm_t		lb_drvnm[MDDB_DRVNMCNT];
	mddb_locator_t		lb_locators[MDDB_NLB];
	/* Don't change array sizes without changing MDDB_MNLBCNT */
	mddb_mnsidelocator_t	lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB];
} mddb_mnlb_t;
479
480
/*
 * Locator block sizes in MDDB_BSIZE blocks, rounded up.  MDDB_LBCNT counts
 * the whole mddb_lb_t; MDDB_LOCAL_LBCNT discounts all but one side's row
 * of lb_sidelocators (see RNDUP_BLKCNT).
 */
#define	MDDB_LBCNT		RNDUP_BLKCNT(sizeof (mddb_lb_t), 0)
#define	MDDB_LOCAL_LBCNT	RNDUP_BLKCNT(sizeof (mddb_lb_t), \
				    sizeof (mddb_sidelocator_t))

/* Blocks needed for a full MN locator block, rounded up. */
#define	MDDB_MNLBCNT		((sizeof (mddb_mnlb_t) + (MDDB_BSIZE - 1)) \
				    / MDDB_BSIZE)
487
/*
 * Block map entry embedded in the master block (mb_blkmap).
 * NOTE(review): appears to describe a run of m_consecutive blocks
 * starting at m_firstblk - confirm against the users of mb_blkmap.
 */
typedef struct mddb_map {
	daddr32_t		m_consecutive;
	daddr32_t		m_firstblk;
} mddb_map_t;
492
/*
 * Master block(s) (MB)
 * 	- Are written by userland; Never by the driver!
 *	- Each replica has its own master blocks,
 *		the master block(s) are not shared.
 *	- MB's are not in the logical block address space of the database.
 *	- MB's are a fixed size record (MDDB_BSIZE)
 *	- MB's provide the logical to physical block translation,
 *		for their replica.
 *
 * mb_next is a 32-bit integer under _LP64 and a pointer otherwise, so the
 * member occupies 32 bits in an ILP32 or LP64 build.  mb_devid is declared
 * with one element; mb_devid_len gives the length of the devid that
 * follows.
 */
typedef	struct mddb_mb {
	int			mb_magic;	/* used for verification */
	uint_t			mb_revision;	/* used for verification */
	uint_t			mb_checksum;	/* used for verification */
#ifdef _LP64
	uint32_t		mb_next;	/* incore to next mb */
#else
	struct mddb_mb		*mb_next;	/* incore to next mb */
#endif	/* _LP64 */
	daddr32_t		mb_nextblk;	/* block # for next mb */
	md_timeval32_t		mb_timestamp;	/* timestamp */
	daddr32_t		mb_blkcnt;	/* size of blkmap */
	daddr32_t		mb_blkno;	/* physical loc. for this MB */
	set_t			mb_setno;	/* used for verification */
	struct timeval32	mb_setcreatetime; /* set creation timestamp */
	int			spares[7];
	mddb_map_t		mb_blkmap;	/* logical->physical blk map */
	int			mb_devid_magic;	/* verify devid in mb */
	short			mb_devid_len;	/* len of following devid */
	char			mb_devid[1];	/* devid byte array */
} mddb_mb_t;
524
/*
 * In-core version of mddb_mb. It is known that the mddb_mb is 512 bytes on
 * disk, really, and so this structure is 512 + sizeof(struct mddb_mb_ic *)
 *
 * MDDB_IC_BSIZE is that size: one MDDB_BSIZE block plus the mbi_next link
 * pointer that precedes the embedded master block.
 */
#define	MDDB_IC_BSIZE	(MDDB_BSIZE + sizeof (struct mddb_mb_ic *))
typedef struct mddb_mb_ic {
	struct mddb_mb_ic 	*mbi_next;	/* next incore master block */
	struct mddb_mb		mbi_mddb_mb;	/* the master block itself */
} mddb_mb_ic_t;
534
535
/*
 * There can be no address in a record block. The checksum must
 * stay the same wherever the record is in memory. Many
 * things depend on this. Also the timestamp is the time the
 * record was committed, not the time it was written to a particular
 * device.
 *
 * Old definition of mddb_rb, for 32-bit apps and libraries
 *
 * rb_userdata is a native pointer here; mddb_rb32 declares it as a
 * uint32_t instead.
 */
typedef struct mddb_rb {
	uint_t			rb_magic;
	uint_t			rb_revision;
	uint_t			rb_checksum;
	uint_t			rb_checksum_fiddle;
	uint_t			rb_private;
	void			*rb_userdata;
	uint_t			rb_commitcnt;
	uint_t			rb_spare[1];
	struct timeval32	rb_timestamp;
	int			rb_data[1];
} mddb_rb_t;
557
/*
 * This is, and always will be, the on-disk version of mddb_rb.
 * It has the same members as mddb_rb except that rb_userdata is a
 * fixed-width 32-bit integer instead of a pointer.
 */
typedef struct mddb_rb32 {
	uint_t			rb_magic;
	uint_t			rb_revision;
	uint_t			rb_checksum;
	uint_t			rb_checksum_fiddle;
	uint_t			rb_private;
	uint32_t		rb_userdata;
	uint_t			rb_commitcnt;
	uint_t			rb_spare[1];
	struct timeval32	rb_timestamp;
	int			rb_data[1];
} mddb_rb32_t;
571
/*
 * directory entries
 *
 * mddb_optinfo_t is the element type of the two-entry de_optinfo arrays in
 * the directory entry structures below.
 * NOTE(review): o_li is presumably a locator index - confirm.
 */
typedef struct mddb_optinfo {
	int		o_li;
	int		o_flags;
} mddb_optinfo_t;
579
/*
 * Old definition of mddb_de, for 32-bit apps and libraries.
 * This is the db_firstentry type of mddb_db when _KERNEL is not defined.
 */
typedef struct mddb_de {
	struct mddb_de	*de_next;
	mddb_rb_t	*de_rb;
	mddb_recid_t	de_recid;
	mddb_type_t	de_type1;
	uint_t		de_type2;
	uint_t		de_reqsize;
	uint_t		de_recsize;
	mddb_block_t	de_blkcount;
	uint_t		de_flags;
	mddb_optinfo_t	de_optinfo[2];
	mddb_block_t	de_blks[1];
} mddb_de_t;
594
/*
 * In core version of mddb_de, includes pointer for mddb_rb32_t user data
 * mddb_rb32_t is used incore
 *
 * de32tode() fills this structure from an on-disk mddb_de32_t and
 * detode32() converts it back.  de_rb_userdata_ic and de_icreqsize are
 * not written by de32tode().  de_blks is declared with one element but
 * both macros index it up to the block count.
 */
typedef struct mddb_de_ic {
	void			*de_rb_userdata;
	void			*de_rb_userdata_ic;
	uint_t			de_owner_nodeid;
	struct mddb_de_ic	*de_next;
	mddb_rb32_t		*de_rb;
	mddb_recid_t		de_recid;
	mddb_type_t		de_type1;
	uint_t			de_type2;
	size_t			de_reqsize;
	size_t			de_icreqsize;
	size_t			de_recsize;
	uint_t			de_blkcount;
	uint_t			de_flags;
	mddb_optinfo_t		de_optinfo[2];
	mddb_block_t		de_blks[1];
} mddb_de_ic_t;
616
/*
 * Incore directory block.  db_next and db_firstentry are native pointers.
 * The entry type is mddb_de_ic_t in the kernel and mddb_de_t elsewhere.
 */
typedef struct mddb_db {
	uint_t			db_magic;
	uint_t			db_revision;
	uint_t			db_checksum;
	mddb_block_t		db_blknum;
	struct mddb_db		*db_next;
	mddb_block_t		db_nextblk;
	struct timeval32	db_timestamp;
	uint_t			db_recsum;
#ifdef _KERNEL
	mddb_de_ic_t		*db_firstentry;
#else
	mddb_de_t		*db_firstentry;
#endif
} mddb_db_t;
632
/*
 * This is, and always will be, the on-disk version of mddb_de
 * When mddb_de32 is read in it is converted into mddb_de_ic
 *
 * de32_next and de32_rb hold 32-bit values where the incore structure
 * has pointers.  de32_blks is declared with one element; the conversion
 * macros copy de32_blkcount entries.
 */
typedef struct mddb_de32 {
	uint32_t	de32_next;
	uint32_t	de32_rb;
	mddb_recid_t	de32_recid;
	mddb_type_t	de32_type1;
	uint_t		de32_type2;
	uint_t		de32_reqsize;
	uint_t		de32_recsize;
	mddb_block_t	de32_blkcount;
	uint_t		de32_flags;
	mddb_optinfo_t	de32_optinfo[2];
	mddb_block_t	de32_blks[1];
} mddb_de32_t;
650
/*
 * This is, and always will be, the on-disk version of mddb_db
 * When mddb_db32 is read in it is converted into mddb_db
 * To minimize impact on the mddb format, the mddb_db fields remain intact
 *
 * db32_next and db32_firstentry hold 32-bit values where mddb_db has
 * pointers.
 */
typedef struct mddb_db32 {
	uint_t			db32_magic;
	uint_t			db32_revision;
	uint_t			db32_checksum;
	mddb_block_t		db32_blknum;
	uint32_t		db32_next;
	mddb_block_t		db32_nextblk;
	struct timeval32	db32_timestamp;
	uint_t			db32_recsum;
	uint32_t		db32_firstentry;
} mddb_db32_t;
667
/*
 * de32tode(from, to): convert an on-disk directory entry (mddb_de32_t *)
 * into its incore form (mddb_de_ic_t *).
 *
 * Clears de_rb_userdata, sets de_owner_nodeid to MD_MN_INVALID_NID, widens
 * the 32-bit de32_next/de32_rb values to pointers, copies the remaining
 * scalar fields and both de_optinfo entries, then copies de32_blkcount
 * block numbers.  de_rb_userdata_ic and de_icreqsize are left untouched.
 *
 * "to" must have room for de32_blkcount entries in de_blks.  Both
 * arguments are parenthesized but evaluated many times, so they must be
 * free of side effects.  The loop counter has a macro-specific name so
 * that an argument mentioning a caller variable "i" is not captured.
 */
#define	de32tode(from, to) \
	{ \
	int de32tode_i; \
	(to)->de_rb_userdata = NULL; \
	(to)->de_owner_nodeid = MD_MN_INVALID_NID; \
	(to)->de_next = (struct mddb_de_ic *)(uintptr_t)(from)->de32_next; \
	(to)->de_rb = (mddb_rb32_t *)(uintptr_t)(from)->de32_rb; \
	(to)->de_recid = (from)->de32_recid; \
	(to)->de_type1 = (from)->de32_type1; \
	(to)->de_type2 = (from)->de32_type2; \
	(to)->de_reqsize = (from)->de32_reqsize; \
	(to)->de_recsize = (from)->de32_recsize; \
	(to)->de_blkcount = (from)->de32_blkcount; \
	(to)->de_flags = (from)->de32_flags; \
	(to)->de_optinfo[0] = (from)->de32_optinfo[0]; \
	(to)->de_optinfo[1] = (from)->de32_optinfo[1]; \
	for (de32tode_i = 0; de32tode_i < (from)->de32_blkcount; \
	    de32tode_i++) \
		(to)->de_blks[de32tode_i] = \
		    (from)->de32_blks[de32tode_i]; \
	}
687
/*
 * detode32(from, to): convert an incore directory entry (mddb_de_ic_t *)
 * into its on-disk form (mddb_de32_t *).
 *
 * The de_next and de_rb pointers are narrowed to 32 bits, the remaining
 * scalar fields and both de_optinfo entries are copied, then de_blkcount
 * block numbers are copied.  The incore-only members (de_rb_userdata,
 * de_rb_userdata_ic, de_owner_nodeid, de_icreqsize) have no on-disk
 * counterpart and are not transferred.
 *
 * "to" must have room for de_blkcount entries in de32_blks.  Both
 * arguments are parenthesized but evaluated many times, so they must be
 * free of side effects.  The loop counter has a macro-specific name so
 * that an argument mentioning a caller variable "i" is not captured.
 */
#define	detode32(from, to) \
	{ \
	int detode32_i; \
	(to)->de32_next = (uint32_t)(uintptr_t)(from)->de_next; \
	(to)->de32_rb = (uint32_t)(uintptr_t)(from)->de_rb; \
	(to)->de32_recid = (from)->de_recid; \
	(to)->de32_type1 = (from)->de_type1; \
	(to)->de32_type2 = (from)->de_type2; \
	(to)->de32_reqsize = (from)->de_reqsize; \
	(to)->de32_recsize = (from)->de_recsize; \
	(to)->de32_blkcount = (from)->de_blkcount; \
	(to)->de32_flags = (from)->de_flags; \
	(to)->de32_optinfo[0] = (from)->de_optinfo[0]; \
	(to)->de32_optinfo[1] = (from)->de_optinfo[1]; \
	for (detode32_i = 0; detode32_i < (from)->de_blkcount; \
	    detode32_i++) \
		(to)->de32_blks[detode32_i] = \
		    (from)->de_blks[detode32_i]; \
	}
705
/*
 * db32todb(from, to): convert an on-disk directory block (mddb_db32_t *)
 * into its incore form (mddb_db_t *), widening the 32-bit db32_next and
 * db32_firstentry values to pointers.
 *
 * The assignments are enclosed in braces so that the whole conversion is
 * controlled by a preceding "if" or loop header; a trailing semicolon
 * after the invocation remains harmless.  Both arguments are
 * parenthesized but evaluated many times, so they must be free of side
 * effects.
 *
 * NOTE(review): db_firstentry is cast to mddb_de_ic_t *, which is its
 * declared type only when _KERNEL is defined.
 */
#define	db32todb(from, to) \
	{ \
	(to)->db_magic = (from)->db32_magic; \
	(to)->db_revision = (from)->db32_revision; \
	(to)->db_checksum = (from)->db32_checksum; \
	(to)->db_blknum = (from)->db32_blknum; \
	(to)->db_next = (struct mddb_db *)(uintptr_t)(from)->db32_next; \
	(to)->db_nextblk = (from)->db32_nextblk; \
	(to)->db_timestamp = (from)->db32_timestamp; \
	(to)->db_recsum = (from)->db32_recsum; \
	(to)->db_firstentry = \
	    (mddb_de_ic_t *)(uintptr_t)(from)->db32_firstentry; \
	}
716
/*
 * dbtodb32(from, to): convert an incore directory block (mddb_db_t *)
 * into its on-disk form (mddb_db32_t *), narrowing the db_next and
 * db_firstentry pointers to 32 bits.
 *
 * The assignments are enclosed in braces so that the whole conversion is
 * controlled by a preceding "if" or loop header; a trailing semicolon
 * after the invocation remains harmless.  Both arguments are
 * parenthesized but evaluated many times, so they must be free of side
 * effects.
 */
#define	dbtodb32(from, to) \
	{ \
	(to)->db32_magic = (from)->db_magic; \
	(to)->db32_revision = (from)->db_revision; \
	(to)->db32_checksum = (from)->db_checksum; \
	(to)->db32_blknum = (from)->db_blknum; \
	(to)->db32_next = (uint32_t)(uintptr_t)(from)->db_next; \
	(to)->db32_nextblk = (from)->db_nextblk; \
	(to)->db32_timestamp = (from)->db_timestamp; \
	(to)->db32_recsum = (from)->db_recsum; \
	(to)->db32_firstentry = \
	    (uint32_t)(uintptr_t)(from)->db_firstentry; \
	}
727
/*
 * Information about a replica of the database.
 * Entries are chained through ri_next; mddb_set_t's s_rip heads the
 * incore replica list.
 */
typedef struct mddb_ri {
	struct mddb_ri		*ri_next;
	uint_t			ri_flags;
	uint_t			ri_commitcnt;
	int			ri_transplant;
	md_dev64_t		ri_dev;
	daddr32_t		ri_blkno;
	char			ri_driver[16];
	mddb_mb_ic_t		*ri_mbip;	/* incore master block(s) */
	mddb_lb_t		*ri_lbp;	/* locator block */
	mddb_dt_t		*ri_dtp;	/* data tag block */
	mddb_did_ic_t		*ri_did_icp;	/* device ID incore area */
	ddi_devid_t		ri_devid;
	ddi_devid_t		ri_old_devid;
	char			ri_minor_name[MDDB_MINOR_NAME_MAX];
	char			ri_devname[MAXPATHLEN];
} mddb_ri_t;
748
/*
 * Buffer wrapper: an embedded buf_t plus the locator it is associated
 * with, chained through bf_next (mddb_set_t's s_freebufhead is a list
 * head of this type).
 */
typedef struct mddb_bf {
	struct mddb_bf	*bf_next;
	mddb_locator_t	*bf_locator;
	buf_t		bf_buf;
} mddb_bf_t;
754
/*
 * Information for sets of databases (which include replicas)
 *
 * A record id packs a set number and a per-set record number into
 * MDDB_BITSRECID bits: the set number occupies the MD_BITSSET bits
 * starting at MDDB_SETSHIFT, and the record number occupies the bits
 * below that.  DBSET() and DBID() extract the two parts and MAKERECID()
 * combines them, masking each part to its field.
 */
#define	MDDB_BITSRECID	31
#define	MDDB_SETSHIFT	(MDDB_BITSRECID - MD_BITSSET)
#define	MDDB_SETMASK	(MD_SETMASK << MDDB_SETSHIFT)
#define	MDDB_RECIDMASK	((1 << MDDB_SETSHIFT) - 1)

#define	DBSET(id)	(((id) & MDDB_SETMASK) >> MDDB_SETSHIFT)
#define	DBID(id)	((id) & MDDB_RECIDMASK)
#define	MAKERECID(s, i)	((((s) << MDDB_SETSHIFT) & MDDB_SETMASK) | \
			((i) & MDDB_RECIDMASK))
767
/*
 * Parse flag bits.  MDDB_PARSE_MASK covers the three flags defined here
 * plus one further bit (0x8) that has no name in this header.
 */
#define	MDDB_PARSE_LOCBLK	0x00000001
#define	MDDB_PARSE_LOCNM	0x00000002
#define	MDDB_PARSE_OPTRECS	0x00000004
#define	MDDB_PARSE_MASK		0x0000000F


#define	MDDB_BLOCK_PARSE	0x00000001	/* Block sending parse msgs */
#define	MDDB_UNBLOCK_PARSE	0x00000002	/* Unblock sending parse msgs */
776
/*
 * Incore state for one set's database.
 *
 * We need to keep s_ident and s_inittime 32 bit.  They are used in mddb_lb
 */
typedef struct mddb_set {
	uint_t		s_setno;		/* set number */
	uint_t		s_sideno;		/* side number */
	identifier_t	s_ident;		/* set identifier */
	char		*s_setname;		/* set name */
	mddb_mb_ic_t	**s_mbiarray;		/* master blocks array */
	mddb_db_t	*s_dbp;			/* directory block */
	mddb_lb_t	*s_lbp;			/* locator block */
						/* May be cast to mddb_mnlb_t */
						/* if accessing sidenames in */
						/* MN diskset */
	mddb_ln_t	*s_lnp;			/* locator names block */
						/* May be cast to mddb_mnln_t */
						/* if accessing sidenames in */
						/* MN diskset */
	mddb_dtag_lst_t	*s_dtlp;		/* List of data tags found */
	mddb_did_ic_t	*s_did_icp;		/* Device ID incore area */
	mddb_ri_t	*s_rip;			/* replicas incore list */
	int		s_freeblkcnt;		/* visible for test code */
	int		s_totalblkcnt;		/* visible for test code */
	int		s_mn_parseflags;	/* mddb parse flags for MNset */
	int		s_mn_parseflags_sending; /* parse flgs sent to slaves */
	uchar_t		*s_freebitmap;		/* free blocks bitmap */
	uint_t		s_freebitmapsize;	/* size of bitmap */
	struct timeval32	s_inittime;	/* timestamp set created */
	mddb_recid_t	s_zombie;		/* zombie record - createrec */
	int		s_staledeletes;		/* number of stale deleterec */
	int		s_optcmtcnt;		/* Following are opt. record */
	int		s_opthavelck;		/*   bookkeeping records ... */
	int		s_optwantlck;
	kcondvar_t	s_optwantlck_cv;
	int		s_optwaiterr;
	int		s_opthungerr;
	kcondvar_t	s_opthungerr_cv;
	int		s_opthavequeuinglck;
	int		s_optwantqueuinglck;
	kcondvar_t	s_optqueuing_cv;
	ulong_t		s_bufmisses;
	mddb_bf_t	*s_freebufhead;
	int		s_bufwakeup;
	kcondvar_t	s_buf_cv;
	size_t		s_databuffer_size;
	void		*s_databuffer;
	int		s_singlelockgotten;
	int		s_singlelockwanted;
	kcondvar_t	s_single_thread_cv;
	md_hi_arr_t	s_med;
} mddb_set_t;
828
829#ifndef MDDB_FAKE
830#ifdef _KERNEL
/*
 * md_mddb.c
 *
 * Kernel-only entry points (this section is compiled only when _KERNEL is
 * defined and MDDB_FAKE is not).  Per the note on MDDB_E_NORECORD earlier
 * in this header, mddb_getnextrec, mddb_getrecsize, mddb_commitrec,
 * mddb_commitrecs and mddb_deleterec can return that error code.
 */
extern uint_t			mddb_lb_did_convert(mddb_set_t *,
				    uint_t, uint_t *);
extern void			mddb_locatorblock2splitname(mddb_ln_t *,
				    int, side_t, md_splitname *);
extern int			mddb_configure(mddb_cfgcmd_t,
				    struct mddb_config *);
extern mddb_recid_t		mddb_getnextrec(mddb_recid_t,
				    mddb_type_t, uint_t);
extern int			mddb_getoptloc(mddb_optloc_t *);
extern void			*mddb_getrecaddr(mddb_recid_t);
extern void			*mddb_getrecaddr_resize(mddb_recid_t, size_t,
				    off_t);
extern int			mddb_getrecprivate(mddb_recid_t);
extern void			mddb_setrecprivate(mddb_recid_t, uint_t);
extern mddb_de_ic_t		*mddb_getrecdep(mddb_recid_t);
extern mddb_type_t		mddb_getrectype1(mddb_recid_t);
extern int			mddb_getrectype2(mddb_recid_t);
extern int			mddb_getrecsize(mddb_recid_t);
extern int			mddb_commitrec(mddb_recid_t);
extern int			mddb_commitrecs(mddb_recid_t *);
extern int			mddb_deleterec(mddb_recid_t);
extern mddb_recstatus_t		mddb_getrecstatus(mddb_recid_t);
extern mddb_recid_t		mddb_createrec(size_t usersize,
				    mddb_type_t type, uint_t type2,
				    md_create_rec_option_t option, set_t setno);
extern void			mddb_init(void);
extern void			mddb_unload(void);
extern void			mddb_unload_set(set_t setno);
extern mddb_recid_t		mddb_makerecid(set_t setno, mddb_recid_t id);
extern set_t			mddb_getsetnum(mddb_recid_t id);
extern char			*mddb_getsetname(set_t setno);
extern side_t			mddb_getsidenum(set_t setno);
extern int			mddb_ownset(set_t setno);
extern int			getmed_ioctl(mddb_med_parm_t *medpp, int mode);
extern int			setmed_ioctl(mddb_med_parm_t *medpp, int mode);
extern int			updmed_ioctl(mddb_med_upd_parm_t *medpp,
				    int mode);
extern int			take_set(mddb_config_t *cp, int mode);
extern int			release_set(mddb_config_t *cp, int mode);
extern int			gettag_ioctl(mddb_dtag_get_parm_t *dtgpp,
				    int mode);
extern int			usetag_ioctl(mddb_dtag_use_parm_t *dtupp,
				    int mode);
extern int			accept_ioctl(mddb_accept_parm_t *medpp,
				    int mode);
extern int			md_update_locator_namespace(set_t setno,
				    side_t side, char *dname, char *pname,
				    md_dev64_t devt);
extern int			mddb_validate_lb(set_t setno, int *rmaxsz);
extern int			mddb_getinvlb_devid(set_t setno, int count,
				    int size, char **ctdptr);
extern int			md_update_minor(set_t, side_t, mdkey_t);
extern int			md_update_nm_rr_did_ioctl(mddb_config_t *cp);
extern int			md_update_top_device_minor(set_t, side_t,
				    md_dev64_t);
#ifdef DEBUG
/* Only declared in DEBUG builds. */
extern void			mddb_check(void);
#endif /* DEBUG */
890#endif /* _KERNEL */
891
892#else
893
/*
 * Pointer used by the MDDB_FAKE stub macros: mddb_createrec() stores its
 * allocation here and mddb_getrecaddr() returns it.
 * NOTE(review): this is a variable definition in a header, so every file
 * that includes it with MDDB_FAKE defined gets its own tentative
 * definition.
 */
caddr_t mddb_fakeit;
895
/*
 * MDDB_FAKE stubs: macro stand-ins for the md_mddb.c entry points that
 * perform no database I/O.  Everything evaluates to a constant except
 * mddb_createrec(), which allocates a zeroed buffer of s bytes, remembers
 * it in mddb_fakeit and yields the low 16 bits of its address, and
 * mddb_getrecaddr(), which returns mddb_fakeit.
 *
 * mddb_lb_did_convert matches the name of the real function;
 * md_lb_did_convert is the historical spelling of the same stub and is
 * kept for compatibility.
 *
 * NOTE(review): the mddb_createrec() stub takes three arguments while the
 * real function is declared with five - confirm against any MDDB_FAKE
 * callers before changing it.
 */
#define	md_lb_did_convert(a, b, c)	(0)
#define	mddb_lb_did_convert(a, b, c)	(0)
#define	mddb_configure(a, b)	(0)
#define	mddb_getnextrec(a, b, c)		((mddb_recid_t)0)
#define	mddb_getrecaddr(a)	(mddb_fakeit)
#define	mddb_getrecprivate(a)	(0)
#define	mddb_setrecprivate(a, b) (0)
#define	mddb_getrectype1(a)	(0)
#define	mddb_getrectype2(a)	(0)
#define	mddb_getrecsize(a)	(0)
#define	mddb_commitrec(a)	(0)
#define	mddb_commitrecs(a)	(0)
#define	mddb_deleterec(a)	(0)
#define	mddb_getrecstatus(a)	(MDDB_OK)
/* Go through uintptr_t so the pointer is not cast straight to int. */
#define	mddb_createrec(s, a, b)	\
	(0xffff & (int)(uintptr_t)(mddb_fakeit = \
	    (caddr_t)kmem_zalloc((s), KM_SLEEP)))
#define	mddb_unload()		(0)
912
913#endif
914
/* Sleep policy selector: two mutually exclusive values, not a bit mask. */
#define	MDDB_NOSLEEP	1
#define	MDDB_SLEEPOK	0

/* Distinct single-bit flags; may be OR'ed together. */
#define	MDDB_NOOLDOK	0x1
#define	MDDB_MUSTEXIST	0x2
#define	MDDB_NOINIT	0x4
#define	MDDB_MULTINODE	0x8
#define	MDDB_MN_STALE	0x10	/* MN set is stale */

/* Flags passed to selectreplicas - not a bit mask */
#define	MDDB_SCANALL		1
#define	MDDB_RETRYSCAN		0
#define	MDDB_SCANALLSYNC	2	/* During reconfig, sync up incore */
					/* and ondisk mddb by writing incore */
					/* values to disk.  Don't write */
					/* change log records. */

/* Flags passed to writestart and writecopy */
#define	MDDB_WRITECOPY_ALL	1	/* Write all incore mddb to disk */
#define	MDDB_WRITECOPY_SYNC	2	/* Write incore mddb to disk except */
					/* 	- change log records */
					/*	- optimized resync records */


/* Probe selector: two mutually exclusive values. */
#define	MDDB_PROBE	1
#define	MDDB_NOPROBE	0
941
942
/*
 * MN diskset definitions used to determine if a slave can write
 * directly to the mddb.  ONLY_MASTER only allows the master node
 * to write to the mddb.  ANY_NODE allows any node to write
 * to the mddb.
 */
#define	MDDB_WR_ONLY_MASTER	0
#define	MDDB_WR_ANY_NODE	1

/* Record lock state bits. */
#define	MDDB_L_LOCKED	0x0001	/* this record is locked */
#define	MDDB_L_WANTED	0x0002
954
955#ifdef	__cplusplus
956}
957#endif
958
959#endif	/* _SYS_MD_MDDB_H */
960